In [None]:
import io
import os
import time
import hashlib
import operator
from collections import Counter
from collections import namedtuple
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from skimage.util import montage
import cv2
from cv2 import img_hash
import torch

from utils import overlay_tag_pairs
from utils import overlay_tag_maps
from utils import generate_overlay_tag_slices
from utils import get_tile
from utils import get_overlay_score
from utils import get_tile_scores
from utils import get_entropy_score
from utils import gen_pixel_scores
from utils import channel_shift
from utils import read_duplicate_truth
from utils import update_duplicate_truth
from utils import read_image_duplicate_tiles
from utils import write_image_duplicate_tiles
from utils import read_image_image_duplicate_tiles
from utils import update_image_image_duplicate_tiles
from utils import generate_overlay_tag_nines_mask

from test_friend_circles import SDCImageContainer

from dupnet import load_checkpoint

%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Machine epsilon of float32.
# NOTE(review): EPS is not referenced in the visible part of this notebook.
EPS = np.finfo(np.float32).eps

# Font sizes handed to the matplotlib rc defaults below.
# NOTE(review): SMALL_SIZE is not referenced in the visible part of this notebook.
SMALL_SIZE = 10
MEDIUM_SIZE = 12
BIGGER_SIZE = 16
BIGGEST_SIZE = 20
plt.rc('font', size=BIGGEST_SIZE)         # controls default text sizes
plt.rc('axes', titlesize=BIGGEST_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=BIGGEST_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=BIGGER_SIZE)   # fontsize of the tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE)   # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)   # legend fontsize
plt.rc('figure', titlesize=BIGGEST_SIZE)  # fontsize of the figure title

def montage_rgb(x):
    """Build one mosaic per channel with ``montage`` and stack them on a new last axis.

    ``x`` is indexed as ``x[:, :, :, i]`` for every ``i`` in ``range(x.shape[3])``,
    so it must be 4-D with the channel on the last axis.
    """
    channel_mosaics = []
    for channel in range(x.shape[3]):
        channel_mosaics.append(montage(x[:, :, :, channel]))
    return np.stack(channel_mosaics, -1)


def montage_pad(x, *args, **kwargs):
    """Call ``montage`` with ``padding_width`` fixed to 10; other arguments pass through."""
    return montage(x, padding_width=10, *args, **kwargs)

# Input locations, all relative to the notebook's working directory.
ship_dir = "data/input"
train_image_dir = os.path.join(ship_dir, "train_768")
train_mask_dir = os.path.join(ship_dir, 'train_masks_768')
# Cached grid files; these five paths are passed to SDCImageContainer.load_3x3_grids below.
image_counter_grids_file = os.path.join("data", "image_counter_grids.pkl")
image_md5hash_grids_file = os.path.join("data", "image_md5hash_grids.pkl")
image_bm0hash_grids_file = os.path.join("data", "image_bm0hash_grids.pkl")
image_entropy_grids_file = os.path.join("data", "image_entropy_grids.pkl")
image_duplicate_tiles_file = os.path.join("data", "image_duplicate_tiles.txt")
# Read by read_image_image_duplicate_tiles in the "Filter and sort" cell.
image_image_duplicate_tiles_file = os.path.join("data", "image_image_duplicate_tiles.txt")
# Manually labelled overlays; read at start-up and updated from the labelling widget's results.
duplicate_truth_file = os.path.join('data', 'duplicate_truth.txt')

# Lookup keyed by overlay tag; its values are used to index image arrays (img[slice]) when plotting.
overlay_tag_slices = generate_overlay_tag_slices()

In [None]:
def get_channel_entropy(ctr, img_size=1769472):  # 768x768x3
    """Return a value-weighted entropy of a histogram.

    Each ``(value, count)`` pair of ``ctr`` contributes
    ``value * (-p * log(p))`` with ``p = count / img_size``; the terms are
    accumulated in ascending ``value`` order. Note the extra ``value`` weight:
    this is not the plain Shannon entropy, and the bin for value 0 never
    contributes.

    Args:
        ctr: mapping of value -> count (e.g. a ``collections.Counter``).
        img_size: normalising constant for the counts.

    Returns:
        The summed score (``0.0`` for an empty mapping).
    """
    weighted_terms = []
    for value, count in sorted(ctr.items()):
        prob = count / img_size
        weighted_terms.append(value * (-prob * np.log(prob)))
    return np.sum(weighted_terms)


def get_entropy(img_id):
    """Compute a gradient-histogram entropy score for one training image.

    The image is read from ``train_image_dir`` and differentiated along axes
    0 and 1 with ``np.gradient``. For each of the two gradient arrays the
    absolute values are histogrammed and scored by ``get_channel_entropy``
    (normalised by ``img.size``).

    Args:
        img_id: file name of the image inside ``train_image_dir``.

    Returns:
        ``np.ndarray`` with one score per gradient axis (two entries).

    Raises:
        FileNotFoundError: if OpenCV cannot read the image.
    """
    img_path = os.path.join(train_image_dir, img_id)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    # np.int was removed in NumPy 1.24; the builtin int is what it aliased.
    img_grad = np.gradient(img.astype(int), axis=(0, 1))
    entropy_list = []
    # np.gradient returns one array per requested axis (not per colour channel).
    for axis_grad in img_grad:
        ctr = Counter(np.abs(axis_grad).flatten())
        entropy_list.append(get_channel_entropy(ctr, img.size))
    return np.array(entropy_list)

def get_entropy1(img_id):
    """Return histograms of the absolute image gradients of one training image.

    Same pipeline as ``get_entropy`` but with a sample spacing of 0.5 passed to
    ``np.gradient`` and with the absolute gradients cast to ``uint8`` before
    counting. The raw ``Counter`` objects are returned instead of scores.

    Args:
        img_id: file name of the image inside ``train_image_dir``.

    Returns:
        List with one ``Counter`` per gradient axis (two entries).

    Raises:
        FileNotFoundError: if OpenCV cannot read the image.
    """
    img_path = os.path.join(train_image_dir, img_id)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    # np.int was removed in NumPy 1.24; the builtin int is what it aliased.
    img_grad = np.gradient(img.astype(int), 0.5, axis=(0, 1))
    entropy_list = []
    for axis_grad in img_grad:
        # NOTE(review): with spacing 0.5 the one-sided differences at the image border
        # can exceed 255, which the uint8 cast cannot represent — confirm this is acceptable.
        ctr = Counter(np.abs(axis_grad).astype(np.uint8).flatten())
        entropy_list.append(ctr)
    return entropy_list

def get_entropy2(img1_id, img2_id):
    """Score how much the gradient histograms of two images differ.

    For each gradient axis the two ``Counter`` histograms from ``get_entropy1``
    are combined into their symmetric count difference
    ``(c1 - c2) + (c2 - c1)`` and scored with ``get_channel_entropy`` using its
    default ``img_size``.

    Returns:
        ``np.ndarray`` with one score per gradient axis.
    """
    counters1 = get_entropy1(img1_id)
    counters2 = get_entropy1(img2_id)
    return np.array([
        get_channel_entropy((c1 - c2) + (c2 - c1))
        for c1, c2 in zip(counters1, counters2)
    ])

def slice_from_large(img, idx, sz=256):
    """Extract tile ``idx`` of size ``sz`` via ``get_tile`` and swap its B and R channels.

    The swap is done with ``cv2.COLOR_BGR2RGB``, so a BGR input yields an RGB
    tile (and an RGB input yields a BGR one).
    """
    return cv2.cvtColor(get_tile(img, idx, sz=sz), cv2.COLOR_BGR2RGB)

def gen_dcnn_scores(img1, img2, img1_overlay_tag, model, sz=256):
    """Score every overlapping tile pair of two images with the network.

    For each pair of tile indices given by ``overlay_tag_maps`` (for
    ``img1_overlay_tag`` and its partner tag from ``overlay_tag_pairs``) the two
    tiles are scaled to ``[0, 1]``, stacked along the channel axis, moved to
    channels-first layout and batched. The batch is run through ``model``
    without gradient tracking.

    Args:
        img1, img2: image arrays accepted by ``slice_from_large``.
        img1_overlay_tag: overlay tag describing how ``img1`` overlaps ``img2``.
        model: network to evaluate; the batch is moved to the device of its
            first parameter (CPU if that cannot be determined).
        sz: tile edge length in pixels.

    Returns:
        1-D ``np.ndarray`` holding column 0 of the model output, one value per
        tile pair.
    """
    img1_overlay_map = overlay_tag_maps[img1_overlay_tag]
    img2_overlay_map = overlay_tag_maps[overlay_tag_pairs[img1_overlay_tag]]

    X_list = []
    for idx1, idx2 in zip(img1_overlay_map, img2_overlay_map):
        tile1 = slice_from_large(img1, idx1, sz=sz).astype(np.float32) / 255.0
        tile2 = slice_from_large(img2, idx2, sz=sz).astype(np.float32) / 255.0
        # Stack the two tiles along the channel axis, then go channels-first.
        X = np.dstack([tile1, tile2]).transpose((2, 0, 1))
        X_list.append(X)
    X_arr = np.stack(X_list)

    # Use the device the model actually lives on instead of a notebook-global `device`,
    # which is only defined in a much later cell.
    try:
        model_device = next(model.parameters()).device
    except (StopIteration, AttributeError):
        model_device = torch.device('cpu')
    inputs = torch.from_numpy(X_arr).to(model_device)

    with torch.no_grad():
        val_outputs = model(inputs)

    return val_outputs[:, 0].cpu().numpy()


In [None]:
class ImgMod:
    """
    Lazily reads a single image and exposes HLS-modified views of it.

    The ``parent_*`` properties hold the unmodified image (BGR as read by
    OpenCV, plus HLS and RGB conversions). The ``cv2_*`` properties hold the
    image after the optional channel shift configured via ``channel_shift``.
    All of them are computed on first access and cached.
    """

    def __init__(self, filename):
        self.filename = filename
        # basename handles both '/' and the platform's own separator.
        self.img_id = os.path.basename(filename)

        self._hls_chan = None
        self._hls_gain = None

        self._parent_bgr = None
        self._parent_hls = None
        self._parent_rgb = None
        self._cv2_hls = None
        self._cv2_bgr = None
        self._cv2_rgb = None

    def channel_shift(self, chan, gain):
        """Configure the HLS shift and return the resulting RGB image.

        Args:
            chan: channel selector forwarded to ``utils.channel_shift``.
            gain: shift amount forwarded to ``utils.channel_shift``; ``None``
                means "no shift".

        Returns:
            The cached shifted RGB array (callers that modify it in place are
            modifying the cache).
        """
        self._hls_chan = chan
        self._hls_gain = gain
        # Drop every derived cache, not just HLS; otherwise a second call would
        # keep returning the RGB image computed for the previous settings.
        self._cv2_hls = None
        self._cv2_bgr = None
        self._cv2_rgb = None
        return self.cv2_rgb

    def scale(self, minval, maxval):
        """Linearly map ``[minval, maxval]`` of the parent BGR image onto ``[0, 255]``.

        Values outside the range are clipped.

        Raises:
            ValueError: if ``maxval == minval`` (the mapping is undefined).
        """
        if maxval == minval:
            raise ValueError("maxval and minval must differ")
        m = 255.0 / (maxval - minval)
        # Work in float so the subtraction cannot wrap around in uint8.
        res = m * (self.parent_bgr.astype(np.float64) - minval)
        return np.around(np.clip(res, 0.0, 255.0)).astype(np.uint8)

    @property
    def shape(self):
        """Shape of the parent BGR array."""
        return self.parent_bgr.shape

    @property
    def parent_bgr(self):
        """Unmodified image as read by ``cv2.imread`` (loaded on first access).

        Raises:
            FileNotFoundError: if OpenCV cannot read ``self.filename``.
        """
        if self._parent_bgr is None:
            img = cv2.imread(self.filename)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise FileNotFoundError(f"Could not read image: {self.filename}")
            self._parent_bgr = img
        return self._parent_bgr

    @property
    def parent_hls(self):
        """HLS conversion of the unmodified image."""
        if self._parent_hls is None:
            self._parent_hls = self.to_hls(self.parent_bgr)
        return self._parent_hls

    @property
    def parent_rgb(self):
        """RGB conversion of the unmodified image."""
        if self._parent_rgb is None:
            self._parent_rgb = self.to_rgb(self.parent_bgr)
        return self._parent_rgb

    @property
    def cv2_hls(self):
        """HLS image after the configured shift (the parent HLS when no gain is set)."""
        if self._cv2_hls is None:
            if self._hls_gain is None:
                self._cv2_hls = self.parent_hls
            else:
                self._cv2_hls = channel_shift(self.parent_hls, self._hls_chan, self._hls_gain)
        return self._cv2_hls

    @property
    def cv2_bgr(self):
        """BGR conversion of ``cv2_hls``."""
        if self._cv2_bgr is None:
            self._cv2_bgr = self.to_bgr(self.cv2_hls)
        return self._cv2_bgr

    @property
    def cv2_rgb(self):
        """RGB conversion of ``cv2_bgr``."""
        if self._cv2_rgb is None:
            self._cv2_rgb = self.to_rgb(self.cv2_bgr)
        return self._cv2_rgb

    def to_hls(self, bgr):
        """Convert BGR to HLS using OpenCV's ``*_FULL`` variant."""
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2HLS_FULL)

    def to_bgr(self, hls):
        """Convert HLS (``*_FULL`` variant) back to BGR."""
        return cv2.cvtColor(hls, cv2.COLOR_HLS2BGR_FULL)

    def to_rgb(self, bgr):
        """Convert BGR to RGB."""
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

In [None]:
# Build the image container for the training directory and load its cached grid files.
sdcic = SDCImageContainer(train_image_dir)
sdcic.load_3x3_grids(
    image_counter_grids_file,
    image_md5hash_grids_file,
    image_bm0hash_grids_file,
    image_entropy_grids_file,
    image_duplicate_tiles_file)

# Labels recorded so far; used below as a mapping keyed by (img1_id, img2_id, img1_overlay_tag).
dup_truth = read_duplicate_truth(duplicate_truth_file)
len(dup_truth)

In [None]:
n_matching_tiles = 6
overlay_matches_file = os.path.join("data", f"overlay_matches_{n_matching_tiles}.pkl")
overlay_pixel_scores_file = os.path.join("data", f"overlay_pixel_scores_{n_matching_tiles}.pkl")

# (img1_id, img2_id) -> list of (overlay_tag, overlay_score, remaining row values)
matches = defaultdict(list)
df = pd.read_pickle(overlay_matches_file)
for row in tqdm_notebook(df.to_dict('split')['data']):
    pair_key = (row[0], row[1])
    matches[pair_key].append((row[2], row[3], row[4:]))

# (img1_id, img2_id) -> {overlay_tag: remaining row values}; each tag may occur once per pair.
overlay_pixel_scores = {}
df = pd.read_pickle(overlay_pixel_scores_file)
for row in tqdm_notebook(df.to_dict('split')['data']):
    per_tag_scores = overlay_pixel_scores.setdefault((row[0], row[1]), {})
    assert row[2] not in per_tag_scores
    per_tag_scores[row[2]] = row[3:]

# Register only the pairs whose matches all carry one and the same overlay tag.
for (img1_id, img2_id), values in tqdm_notebook(sorted(matches.items())):
    img1_overlay_tags = {v[0] for v in values}
    if not values or len(img1_overlay_tags) != 1:
        continue
    img1_overlay_tag, overlay_score, tile_scores = values[0]
    pixel_scores = overlay_pixel_scores[(img1_id, img2_id)][img1_overlay_tag]
    entropy_score = get_entropy_score(img1_id, img2_id, img1_overlay_tag, sdcic.tile_entropy_grids)
    sdcic.update_overlay_maps(
        img1_id, img2_id, img1_overlay_tag,
        overlay_score, tile_scores, pixel_scores, entropy_score)

print(len(sdcic.overlay_image_maps))

## Find overlapping images using hashlib
Update: The values of two supposedly identical 256x256 crops do not always match exactly (see below).

In [None]:
img_ids = os.listdir(train_image_dir)

# md5 hash -> ids of the images that have a tile with that hash (one entry per tile).
md5hash_dict = defaultdict(list)
for img_id in tqdm_notebook(img_ids):
    tile_hashes = sdcic.tile_md5hash_grids[img_id]
    for h in tile_hashes:
        md5hash_dict[h].append(img_id)

# bucket size -> number of hashes whose bucket has that size
dup_counts_dict = defaultdict(int)
for dups in md5hash_dict.values():
    bucket_size = len(dups)
    dup_counts_dict[bucket_size] += 1

sorted_bin_sizes = sorted(dup_counts_dict.items())
sorted_bin_sizes

In [None]:
# Show up to `batch_limit` images that share one md5 tile hash.
batch_size = 9    # only hash buckets holding exactly this many entries are considered
skip = 365        # stop at the skip-th such bucket (or the last one if there are fewer)
ii = 0
jj = 0
batch_limit = 9   # number of montage slots
# Zero-filled (not np.empty) so that unused montage slots render black instead of
# whatever happened to be in memory.
samples_images = np.zeros((batch_limit, 768, 768, 3), dtype=np.float32)
# Reset explicitly so a value left over from a previous cell run is never displayed.
dups0 = []

for hash_id, dups in md5hash_dict.items():
    ii += 1
    if len(dups) == batch_size:
        # sorted() rather than list(set(...)) keeps the selection reproducible between runs.
        dups0 = sorted(set(dups))
        img_id = dups0[0]
        idx = sdcic.tile_md5hash_grids[img_id].index(hash_id)
        print(hash_id, len(dups), ii, sdcic.tile_entropy_grids[img_id][idx])
        if jj == min(dup_counts_dict[len(dups)], skip):
            break
        jj += 1

if not dups0:
    print(f"No hash bucket with exactly {batch_size} entries was found.")

for i, c_img_id in enumerate(dups0[:batch_limit]):
    c_img = cv2.cvtColor(sdcic.get_img(c_img_id), cv2.COLOR_BGR2RGB)
    samples_images[i] = c_img.astype(np.float32) / 255.0

batch_rgb = montage_rgb(samples_images)
print(samples_images.shape)
print(batch_rgb.shape, batch_rgb.dtype)

fig, ax = plt.subplots(1, 1, figsize = (16, 16))
ax.imshow(batch_rgb, vmin=0, vmax=1)
ax.axis('off')
# plt.savefig(os.path.join('out', BASE_MODEL, f"{train_meta_filebase}_{score_str}_batch_{BATCH_NUM}.jpg"))
plt.show()

## Find overlapping images using cv2.img_hash

In [None]:
img_ids = os.listdir(train_image_dir)

# block-mean hash (as a tuple, so it is hashable) -> ids of the images that have such a tile
bm0hash_dict = defaultdict(list)
for img_id in tqdm_notebook(img_ids):
    tile_hashes = sdcic.tile_bm0hash_grids[img_id]
    for h in tile_hashes:
        bm0hash_dict[tuple(h)].append(img_id)  # hex

# bucket size -> number of hashes whose bucket has that size
dup_counts_dict = defaultdict(int)
for dups in bm0hash_dict.values():
    bucket_size = len(dups)
    dup_counts_dict[bucket_size] += 1

sorted_bin_sizes = sorted(dup_counts_dict.items())
sorted_bin_sizes

In [None]:
# Show up to `batch_limit` images that share one block-mean tile hash.
batch_size = 18   # only hash buckets holding exactly this many entries are considered
skip = 5          # stop at the skip-th such bucket (or the last one if there are fewer)
ii = 0
jj = 0
batch_limit = 9   # number of montage slots
# Zero-filled (not np.empty) so that unused montage slots render black instead of
# whatever happened to be in memory.
samples_images = np.zeros((batch_limit, 768, 768, 3), dtype=np.float32)
# Reset explicitly so a value left over from a previous cell run is never displayed.
dups0 = []

for hash_id, dups in bm0hash_dict.items():
    ii += 1
    if len(dups) == batch_size:
        # sorted() rather than list(set(...)) keeps the selection reproducible between runs.
        dups0 = sorted(set(dups))
        img_id = dups0[0]
        # NOTE(review): hash_id is a tuple (the bm0hash_dict keys are built with tuple(h));
        # this lookup only works if the stored grid supports .index() and its entries
        # compare equal to tuples — confirm the stored type.
        idx = sdcic.tile_bm0hash_grids[img_id].index(hash_id)
        print(hash_id, len(dups), ii, sdcic.tile_entropy_grids[img_id][idx])
        if jj == min(dup_counts_dict[len(dups)], skip):
            break
        jj += 1

if not dups0:
    print(f"No hash bucket with exactly {batch_size} entries was found.")

for i, c_img_id in enumerate(dups0[:batch_limit]):
    c_img = cv2.cvtColor(sdcic.get_img(c_img_id), cv2.COLOR_BGR2RGB)
    samples_images[i] = c_img.astype(np.float32) / 255.0

batch_rgb = montage_rgb(samples_images)
print(samples_images.shape)
print(batch_rgb.shape, batch_rgb.dtype)

fig, ax = plt.subplots(1, 1, figsize = (16, 16))
ax.imshow(batch_rgb, vmin=0, vmax=1)
ax.axis('off')
# plt.savefig(os.path.join('out', BASE_MODEL, f"{train_meta_filebase}_{score_str}_batch_{BATCH_NUM}.jpg"))
plt.show()

## Here we explore dup detection using image gradients and cross entropy 

In [None]:
# Walk over all labelled overlays and report every overlay that pushes one of the two
# running limits: the highest entropy score seen for a non-duplicate (score_lim0) or the
# lowest entropy score seen for a duplicate (score_lim1).
score_lim0 = 0
score_lim1 = 1
for (img1_id, img2_id), overlay_maps in tqdm_notebook(sdcic.overlay_image_maps.items()):
    if img1_id > img2_id:
        # sanity check
        raise ValueError(f'img1_id ({img1_id}) should be lexicographically smaller than img2_id ({img2_id})')
    for img1_overlay_tag, (overlay_score, tile_scores, pixel_scores, entropy_score) in overlay_maps.items():
        if (img1_id, img2_id, img1_overlay_tag) not in dup_truth:
            continue
        is_dup = dup_truth[(img1_id, img2_id, img1_overlay_tag)]

        if is_dup == 0 and entropy_score > score_lim0:
            score_lim0 = entropy_score
        elif is_dup == 1 and entropy_score < score_lim1:
            score_lim1 = entropy_score
        else:
            # Neither limit moved: nothing to report for this overlay.
            continue

        # Each get_entropy call re-reads and differentiates a full image, so compute
        # the vectors once and reuse them for every line printed below.
        img1_entropy_vec = get_entropy(img1_id)
        img2_entropy_vec = get_entropy(img2_id)
        img1_entropy_vec_norm = np.linalg.norm(img1_entropy_vec)
        img2_entropy_vec_norm = np.linalg.norm(img2_entropy_vec)
        n_vec = np.max([img1_entropy_vec_norm, img2_entropy_vec_norm])
        img1_scaled_vec = img1_entropy_vec / n_vec
        img2_scaled_vec = img2_entropy_vec / n_vec
        grad_score = 1.0 - np.linalg.norm(img1_scaled_vec - img2_scaled_vec)

        entropy2 = get_entropy2(img1_id, img2_id)
        entropy2_norm = np.linalg.norm(entropy2)

        print('')
        print(f"{is_dup}, {overlay_score:7.5f}, {min(tile_scores):7.5f}, {grad_score:7.5f}, {entropy2_norm}")
        print(img1_id, img1_entropy_vec, f"{img1_entropy_vec_norm}")
        print(img2_id, img2_entropy_vec, f"{img2_entropy_vec_norm}")
        print(img1_entropy_vec)
        print(img2_entropy_vec)
        print(entropy2)
        print(entropy_score)


In [None]:
# Hand-picked image pair inspected by the next two cells.
img1_id = '691d5afc2.jpg'
img2_id = '56417e7af.jpg'

In [None]:
# Step-by-step version of get_entropy1 for img2: gradient along axes 0 and 1 with sample
# spacing 0.5, then a histogram of the absolute axis-0 gradient cast to uint8.
img2 = cv2.imread(os.path.join(train_image_dir, img2_id))
if img2 is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError(f"Could not read image: {os.path.join(train_image_dir, img2_id)}")
# np.int was removed in NumPy 1.24; the builtin int is what it aliased.
img2_grad = np.gradient(img2.astype(int), 0.5, axis=(0, 1))
ctr2 = Counter(np.abs(img2_grad[0]).astype(np.uint8).flatten())

In [None]:
# Compare the gradient-entropy vectors of the hand-picked pair after scaling both by
# the larger of the two norms (same normalisation as the scan over labelled overlays).
img1_entropy_vec = get_entropy(img1_id)
img2_entropy_vec = get_entropy(img2_id)
img1_entropy_vec_norm = np.linalg.norm(img1_entropy_vec)
img2_entropy_vec_norm = np.linalg.norm(img2_entropy_vec)
# Fixed copy-paste slip: the maximum must be taken over both norms, not img1's twice.
n_vec = np.max([img1_entropy_vec_norm, img2_entropy_vec_norm])
img1_scaled_vec = img1_entropy_vec / n_vec
img2_scaled_vec = img2_entropy_vec / n_vec
print('')
print(img1_id, img1_entropy_vec, f"{img1_entropy_vec_norm}")
# Fixed copy-paste slip: report img2's own norm next to img2.
print(img2_id, img2_entropy_vec, f"{img2_entropy_vec_norm}")
print(f"{np.linalg.norm(img1_scaled_vec - img2_scaled_vec)}")

In [None]:
# For every blacklisted pair print both gradient-entropy vectors, the cosine of the
# angle between them and their Euclidean distance.
# A multi-character separator is only supported by the python parser; naming the engine
# explicitly keeps the parse identical and avoids the ParserWarning about the fallback.
df = pd.read_csv('dup_blacklist_6.csv', sep=', ', engine='python')
for idx, row in df.iterrows():
    print(idx)
    img1_entropy_vec = get_entropy(row['ImageId1'])
    img1_entropy_vec_u = img1_entropy_vec / np.linalg.norm(img1_entropy_vec)
    print(row['ImageId1'], img1_entropy_vec)
    img2_entropy_vec = get_entropy(row['ImageId2'])
    img2_entropy_vec_u = img2_entropy_vec / np.linalg.norm(img2_entropy_vec)
    print(row['ImageId2'], img2_entropy_vec)
    print(np.dot(img1_entropy_vec_u, img2_entropy_vec_u), np.linalg.norm(img1_entropy_vec - img2_entropy_vec))

# Create an interactive widget for tagging duplicate overlays.

In [None]:
from IPython.display import display
from ipywidgets import Button, Image, Layout, Box, HBox, VBox, Output

## Collect the various scores together

In [None]:
# Bundle of the four scores stored per overlay.
Overlay_Scores = namedtuple('overlay_scores', ['overlay', 'tile', 'pixel', 'entropy'])

# Every known overlay that has not been labelled yet becomes a candidate.
overlay_candidates = []
for (img1_id, img2_id), overlay_maps in tqdm_notebook(sdcic.overlay_image_maps.items()):
    if img1_id > img2_id:
        # sanity check
        raise ValueError(f'img1_id ({img1_id}) should be lexicographically smaller than img2_id ({img2_id})')
    overlay_candidates.extend(
        (img1_id, img2_id, tag, Overlay_Scores(*tag_scores))
        for tag, tag_scores in overlay_maps.items()
        if (img1_id, img2_id, tag) not in dup_truth
    )

## Filter and sort

In [None]:
# Load the per-image-pair duplicate-tile table and the per-overlay-tag tile masks. Both are
# needed by the (currently disabled) filters below and by the labelling widget further down.
sdcic.image_image_duplicate_tiles = read_image_image_duplicate_tiles(image_image_duplicate_tiles_file)
overlay_tag_nines_mask = generate_overlay_tag_nines_mask()

# Candidates are visited ordered by (img1_id, img1_overlay_tag). Every filter in the loop body
# is commented out, so at the moment all overlay candidates are kept.
duplicate_candidates = []
for img1_id, img2_id, img1_overlay_tag, overlay_scores in sorted(overlay_candidates, key=operator.itemgetter(0, 2), reverse=False):
#     img1_nine, img2_nine = sdcic.image_image_duplicate_tiles[(img1_id, img2_id)]
#     img1_mask = img1_nine[overlay_tag_nines_mask[img1_overlay_tag]]
#     img2_mask = img2_nine[overlay_tag_nines_mask[overlay_tag_pairs[img1_overlay_tag]]]
    
    # This is here so I don't forget to address small 2 tile or 1 tile overlays later.
#     if len(img1_mask) <= 2:
#         print(img1_mask, img2_mask)
#         continue
    
    # (0, 3, 6) == (0, 3, 6) is an exact duplicate.
#     if len(set(img1_mask)) == len(img1_mask) and np.all(img1_mask == img2_mask) and 9 not in img1_mask:
#         continue
    
    # (0, 0, 0) == (0, 0, 0): skip, probably a duplicate of white clouds or blue border.
#     if len(set(img1_mask) | set(img2_mask)) == 1 and 9 not in img1_mask:
#         continue
    
    # (0, 0, 0) == (2, 2, 2) is NOT a duplicate; probably white clouds overlaid with blue border.
#     if len(set(img1_mask)) == 1 and len(set(img2_mask)) == 1 and set(img1_mask) != set(img2_mask) and 9 not in img1_mask and 9 not in img2_mask:
#         continue
        
#     if len(set(img1_mask)) == 1 and len(set(img2_mask)) != 1 and 9 not in img1_mask:
#         if set(img1_mask) != set(img2_mask):
#         continue
    
#     if overlay_scores.entropy <= 0.995 or overlay_scores.entropy >= 0.999:
#         continue
    
    duplicate_candidates.append((img1_id, img2_id, img1_overlay_tag, overlay_scores))

# Number of candidates before and after filtering.
print(len(overlay_candidates), len(duplicate_candidates))

In [None]:
# One-shot iterator consumed by get_next_img_pair; re-run this cell to start over.
candidates_iter = iter(duplicate_candidates)
# (img1_id, img2_id, img1_overlay_tag) -> 1 (same) / 0 (different), filled in by the button handlers.
overlay_labels = {}

In [None]:
# Module-level state describing the overlay currently shown in the widget; it is
# rebound by get_next_img_pair and read by the button handlers.
img1_id = None
img2_id = None
img1_overlay_tag = None

# Layout of the row that holds the three labelling buttons.
box_layout = Layout(display='flex',
                    flex_flow='row',
                    align_items='stretch',
                    width='100%')

def get_next_img_pair():
    """Advance ``candidates_iter`` to the next overlay worth showing and return it.

    Side effect: the globals ``img1_id``, ``img2_id`` and ``img1_overlay_tag``
    are rebound for every candidate pulled from the iterator, including the
    ones that get skipped.

    A candidate is skipped when
      * ``img1_id > img2_id``,
      * it already has an entry in ``overlay_labels``,
      * either tile mask holds a single repeated value (with no 9 in it) and
        the entropy score is below 0.001 — one of the images is all white
        clouds, blue border or black background, or
      * its largest pixel score is below 200.

    Returns:
        ``(img1_id, img2_id, img1_overlay_tag, scores)`` of the accepted candidate.

    Raises:
        StopIteration: when ``candidates_iter`` is exhausted.
    """
    global img1_id, img2_id, img1_overlay_tag

    def _is_uniform(mask):
        # A single repeated value and no 9 in the mask.
        return len(set(mask)) == 1 and 9 not in mask

    while True:
        img1_id, img2_id, img1_overlay_tag, scores = next(candidates_iter)

        if img1_id > img2_id:
            continue
        if (img1_id, img2_id, img1_overlay_tag) in overlay_labels:
            continue

        img1_nine, img2_nine = sdcic.image_image_duplicate_tiles[(img1_id, img2_id)]
        img1_mask = img1_nine[overlay_tag_nines_mask[img1_overlay_tag]]
        img2_mask = img2_nine[overlay_tag_nines_mask[overlay_tag_pairs[img1_overlay_tag]]]

        if (_is_uniform(img1_mask) or _is_uniform(img2_mask)) and scores.entropy < 0.001:
            continue
        if max(scores.pixel) < 200:
            continue

        return img1_id, img2_id, img1_overlay_tag, scores

def draw_images(img1_id, img2_id, img1_overlay_tag, scores):
    """Draw a 2x2 comparison figure for one overlay candidate.

    Both images are shown with a lightness shift of +/-100 applied everywhere
    except in the overlapping region. Top row: the overlap shows the image
    minus the overlap's per-channel median. Bottom row: the overlap shows the
    unmodified image. The panel titles carry the overlay score, the minimum
    tile score, the entropy score and the maximum pixel score, plus the
    duplicate-tile mask values of each image.

    Returns:
        The 2x2 array of matplotlib axes.
    """
    imgmod1 = ImgMod(os.path.join(train_image_dir, img1_id))
    imgmod2 = ImgMod(os.path.join(train_image_dir, img2_id))

    # Tick marks every 256 pixels.
    dtick = 256
    ticks = [dtick * k for k in range(imgmod1.shape[1] // dtick + 1)]

    paired_tag = overlay_tag_pairs[img1_overlay_tag]
    slice1 = overlay_tag_slices[img1_overlay_tag]
    slice2 = overlay_tag_slices[paired_tag]

    # Per-channel median over both overlapping regions.
    overlap_pixels = np.vstack([imgmod1.parent_rgb[slice1], imgmod2.parent_rgb[slice2]])
    m12 = np.median(overlap_pixels, axis=(0, 1), keepdims=True).astype(np.uint8)
    img1_drop = imgmod1.parent_rgb - m12
    img2_drop = imgmod2.parent_rgb - m12

    # Darken when the median colour sums to 384 or more, brighten otherwise.
    brightness_level = 100 if np.sum(m12) < 384 else -100
    img1 = imgmod1.channel_shift('L', brightness_level)
    img2 = imgmod2.channel_shift('L', brightness_level)

    img1_nine, img2_nine = sdcic.image_image_duplicate_tiles[(img1_id, img2_id)]
    img1_mask = img1_nine[overlay_tag_nines_mask[img1_overlay_tag]]
    img2_mask = img2_nine[overlay_tag_nines_mask[paired_tag]]
    img1_dups_str = ' '.join(str(t) for t in img1_mask)
    img2_dups_str = ' '.join(str(t) for t in img2_mask)

    fig, ax = plt.subplots(2, 2, figsize=(15, 15))

    def _show(axis, image, title):
        axis.imshow(image)
        axis.set_title(title)
        axis.set_xticks(ticks)
        axis.set_yticks(ticks)

    # Top row: overlap replaced by the median-subtracted pixels.
    img1[slice1] = img1_drop[slice1]
    img2[slice2] = img2_drop[slice2]
    _show(ax[0][0], img1, f'{img1_id} {scores.overlay:7.5f}')
    _show(ax[0][1], img2, f'{img2_id} {min(scores.tile):7.5f}')

    # Bottom row: overlap restored to the original pixels (same arrays, modified in
    # place, so this has to happen after the top row was drawn).
    img1[slice1] = imgmod1.parent_rgb[slice1]
    img2[slice2] = imgmod2.parent_rgb[slice2]
    _show(ax[1][0], img1, f'({img1_dups_str}) {scores.entropy:7.5f}')
    _show(ax[1][1], img2, f'({img2_dups_str}) {max(scores.pixel)}')

    return ax
    
def redraw(img1_id, img2_id, img1_overlay_tag, scores):
    """Replace the content of the ``out`` widget with the figure for the given overlay."""
    # The positional True is the `wait` flag of clear_output.
    out.clear_output(True)
    with out:
        draw_images(img1_id, img2_id, img1_overlay_tag, scores)
        plt.show()
    

In [None]:
# The three labelling buttons. `icon` takes a Font Awesome icon name; 'x' and '?' are not
# icon names (nothing is rendered for them), so 'times' and 'question' are used instead.
same_button = Button(
    description='Same',
    disabled=False,
    layout=Layout(flex='1 1 auto', width='auto'), 
    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Overlays are the same',
    icon='check'
)

diff_button = Button(
    description='Diff',
    disabled=False,
    layout=Layout(flex='1 1 auto', width='auto'), 
    button_style='danger', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Overlays are different',
    icon='times'
)

skip_button = Button(
    description='Skip',
    disabled=False,
    layout=Layout(flex='1 1 auto', width='auto'), 
    button_style='info', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Not sure.  Skip for now.',
    icon='question'
)

def _label_current_and_advance(label=None):
    """Optionally label the overlay on display, then show the next candidate.

    Args:
        label: 1 (same) or 0 (different) to record for the current overlay, or
            ``None`` to record nothing (skip).

    When no candidates are left the three buttons are disabled. Without that,
    the globals would still point at the last (filtered-out) candidate pulled
    by ``get_next_img_pair`` and a further click would label the wrong overlay.
    """
    global img1_id, img2_id, img1_overlay_tag
    if label is not None:
        overlay_labels[(img1_id, img2_id, img1_overlay_tag)] = label
    try:
        img1_id, img2_id, img1_overlay_tag, scores = get_next_img_pair()
    except StopIteration:
        for button in (same_button, diff_button, skip_button):
            button.disabled = True
        with out:
            print('No more duplicate candidates to review.')
        return
    redraw(img1_id, img2_id, img1_overlay_tag, scores)


def on_same_button_clicked(b):
    """Record the current overlay as a duplicate (1) and move on."""
    _label_current_and_advance(1)


def on_diff_button_clicked(b):
    """Record the current overlay as not a duplicate (0) and move on."""
    _label_current_and_advance(0)


def on_skip_button_clicked(b):
    """Move on without recording a label."""
    _label_current_and_advance(None)


same_button.on_click(on_same_button_clicked)
diff_button.on_click(on_diff_button_clicked)
skip_button.on_click(on_skip_button_clicked)

In [None]:
# Assemble and show the widget: the output area on top, the three buttons underneath.
out = Output()
buttons_3 = Box(children=[same_button, diff_button, skip_button], layout=box_layout)
display(VBox([out, buttons_3]))

# Draw the first candidate; from here on the button handlers take over.
# Raises StopIteration if no candidate passes the filters in get_next_img_pair.
img1_id, img2_id, img1_overlay_tag, overlay_scores = get_next_img_pair()
with out:
    ax = draw_images(img1_id, img2_id, img1_overlay_tag, overlay_scores)
    plt.show()

In [None]:
# Ad-hoc check: show the tile_entropy_grids entry stored for one hand-picked image.
sdcic.tile_entropy_grids['00374ccfa.jpg']

In [None]:
# Ad-hoc check: show the tile_entropy_grids entry stored for one hand-picked image.
sdcic.tile_entropy_grids['218bb7055.jpg']

In [None]:
# Labelling progress: number of labels and their share of all candidates in percent.
# Raises ZeroDivisionError when duplicate_candidates is empty.
len(overlay_labels), 100*len(overlay_labels)/len(duplicate_candidates)

In [None]:
# Display every label collected in this session.
overlay_labels

In [None]:
# Show the key of the most recently added label (dicts keep insertion order);
# None when nothing has been labelled yet, instead of failing with a NameError.
k = list(overlay_labels)[-1] if overlay_labels else None
k

In [None]:
# Manual correction: drop one label again. pop() with a default keeps the cell
# re-runnable (del raises KeyError once the key is gone).
overlay_labels.pop(('b887a184a.jpg', 'ca57c33ce.jpg', '0010'), None);

In [None]:
# Manual correction: set the label of one overlay to 0 (the value the Diff button records).
overlay_labels[('0ee790381.jpg', '2f6c0deaa.jpg', '1020')] = 0

In [None]:
# Hand this session's labels, together with the truth file path, to update_duplicate_truth.
update_duplicate_truth(duplicate_truth_file, overlay_labels)

After we create image_duplicate_tiles.txt, let's check how many duplicate tiles we actually have.

In [None]:
# NOTE(review): the cell below iterates sdcic.image_duplicate_tiles, not this variable —
# confirm whether this separate read is still needed.
image_duplicate_tiles = read_image_duplicate_tiles(image_duplicate_tiles_file)

In [None]:
# Collect every tile whose value in an image's duplicate-tile vector occurs more than once.
dup_tiles = []    # pixel data of the first tile seen for each hash
dup_hashes = {}   # md5 hash -> {img_id: [tile indices]}
dup_files = []    # image id of the first tile seen for each hash (parallel to dup_tiles)
dup_counts = {}   # md5 hash -> number of tiles seen with that hash
for img_id, img_dup9 in sdcic.image_duplicate_tiles.items():
    img = None
    for i, c in Counter(img_dup9).items():
        if c == 1:
            continue
        for ii in np.where(img_dup9 == i)[0]:
            new_hash = sdcic.tile_md5hash_grids[img_id][ii]
            first_sighting = new_hash not in dup_hashes

            dup_counts[new_hash] = dup_counts.get(new_hash, 0) + 1
            dup_hashes.setdefault(new_hash, {}).setdefault(img_id, []).append(ii)

            if first_sighting:
                # Keep one representative tile per hash.
                dup_files.append(img_id)
                img = sdcic.get_img(img_id)
                new_tile = sdcic.get_tile(img, ii)
                dup_tiles.append(new_tile)
                print(len(dup_files)-1, new_hash, img_id, ii)
dup_counts

In [None]:
# Display the hash -> {img_id: [tile indices]} mapping built in the previous cell.
dup_hashes

In [None]:
# Print the pixel values two pixels in from each of the four corners of every collected tile.
for ii, dup_tile in enumerate(dup_tiles):
    print(ii)
    print(dup_tile[2, 2], dup_tile[2, -2])
    print(dup_tile[-2, 2], dup_tile[-2, -2])

In [None]:
# Side-by-side view of the source images of collected tiles 0 and 5.
# NOTE(review): the indices are hard-coded and refer to the order printed by the collection cell.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))
img1 = cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, dup_files[0])), cv2.COLOR_BGR2RGB)
ax1.imshow(img1)
img2 = cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, dup_files[5])), cv2.COLOR_BGR2RGB)
ax2.imshow(img2)

In [None]:
# Side-by-side view of the source images of collected tiles 1 and 2.
# NOTE(review): the indices are hard-coded and refer to the order printed by the collection cell.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))
img1 = cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, dup_files[1])), cv2.COLOR_BGR2RGB)
ax1.imshow(img1)
img2 = cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, dup_files[2])), cv2.COLOR_BGR2RGB)
ax2.imshow(img2)

In [None]:
# Side-by-side view of the source images of collected tiles 3 and 4.
# NOTE(review): the indices are hard-coded and refer to the order printed by the collection cell.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))
img1 = cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, dup_files[3])), cv2.COLOR_BGR2RGB)
ax1.imshow(img1)
img2 = cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, dup_files[4])), cv2.COLOR_BGR2RGB)
ax2.imshow(img2)

In [None]:
# Reference hashes of four solid-colour 256x256 tiles (channel 0 / channel 2 set to 255
# for the tiles named blue / red).
tile_shape = (256, 256, 3)
black_tile = np.zeros(tile_shape, dtype=np.uint8)
white_tile = np.full(tile_shape, 255, dtype=np.uint8)
blue_tile = np.zeros(tile_shape, dtype=np.uint8)
blue_tile[:, :, 0] = 255
red_tile = np.zeros(tile_shape, dtype=np.uint8)
red_tile[:, :, 2] = 255

color_tiles = [black_tile, white_tile, blue_tile, red_tile]
for color_tile in color_tiles:
    block_mean_hash = img_hash.blockMeanHash(color_tile, mode=0)[0]
    md5_hex = hashlib.md5(color_tile.tobytes()).hexdigest()
    print(block_mean_hash)
    print(md5_hex)


In [None]:
# Hand-collected image ids, shown in a 5x5 grid with the id as panel title.
black_images = [
    '03ffa7680.jpg', '8d5521663.jpg', '5a70ef013.jpg', '9a2f9d347.jpg', '37a912dca.jpg',
    '4add7aa1d.jpg', '3db3ef7cc.jpg', '73fec0637.jpg', '7df214d98.jpg', 'c2955cd21.jpg',
    'de018b2a8.jpg', '8ce769141.jpg', 'fc0e22a0a.jpg', '770c46cd4.jpg', 'd6e432b79.jpg',
    'd5d1b6fb8.jpg', '0e4d7dd93.jpg', '9ddeed533.jpg', 'addc11de0.jpg', '65418dfe4.jpg',
    '119d6a3d6.jpg', '1b287c905.jpg', 'b264b0f96.jpg', '996f92939.jpg', 'e5c3b1f59.jpg',
]
fig, ax = plt.subplots(5, 5, figsize=(15, 15))
for i, img_id in enumerate(black_images):
    grid_row, grid_col = divmod(i, 5)
    img = cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, img_id)), cv2.COLOR_BGR2RGB)
    panel = ax[grid_row, grid_col]
    panel.imshow(img)
    panel.set_title(img_id)
plt.tight_layout()

## Try out a decision tree classifier for dup_truth

In [None]:
from sklearn import tree
import graphviz

In [None]:
# Build the training set for the decision tree from every labelled overlay that also has
# stored scores: L holds the overlay keys, X the features [max(pixel_scores), entropy_score]
# and Y the labels (one single-element row per sample).
L, X, Y = [], [], []
for (img1_id, img2_id, img1_overlay_tag), is_dup in dup_truth.items():
    pair = (img1_id, img2_id)
    if pair not in sdcic.overlay_image_maps or img1_overlay_tag not in sdcic.overlay_image_maps[pair]:
        continue
    overlay_maps = sdcic.overlay_image_maps[pair]
    overlay_score, tile_scores, pixel_scores, entropy_score = overlay_maps[img1_overlay_tag]

    L.append((img1_id, img2_id, img1_overlay_tag))
    X.append([max(pixel_scores), entropy_score])
    Y.append([is_dup])

L = np.array(L)
X = np.array(X)
Y = np.array(Y)
print(len(X))
print(len(Y), sum(Y))

In [None]:
# Fit a decision tree on [max(pixel), entropy] and render it with graphviz.
# random_state is fixed because the estimator permutes features at every split, so ties
# between equally good splits could otherwise yield a differently numbered tree on each
# run — and a later cell looks up a hard-coded node id.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X, Y)
dot_data = tree.export_graphviz(clf, out_file=None, 
                                feature_names=['max(pixel)', 'entropy'], 
                                filled=True, rounded=True, special_characters=True, leaves_parallel=True) 
graph = graphviz.Source(dot_data) 
graph

In [None]:
# Index of the tree leaf each training sample ends up in.
all_nodes = clf.apply(X)

In [None]:
# Print key, label, features and stored scores of the idx-th sample that falls into leaf 4.
# NOTE(review): node id 4 is hard-coded and depends on the fitted tree; an IndexError here
# means no sample landed in that node.
nodes = np.where(all_nodes == 4)
idx = 0
print(L[nodes][idx], Y[nodes][idx], X[nodes][idx])
print(sdcic.overlay_image_maps[(L[nodes][idx][0], L[nodes][idx][1])])

In [None]:
# 6
['1ebdf2f08.jpg', 'b1bfb768c.jpg', '0012'] [1] [91.          0.99781223]
{'0012': (
    1.0, 
    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 
    [0, 0, 0, 91, 0, 0], 
    0.9978122327282525)}

['2323bf875.jpg', 'b5da61fce.jpg', '0021'] [1] [2.05663500e+06 9.98277186e-01]
{'0021': (
    0.9913398692810458, 
    [0.984313725490196, 1.0, 1.0, 0.996078431372549, 0.9686274509803922, 0.9990196078431373], 
    [2043928, 1911748, 2056635, 2046531, 2053888, 2019553], 
    0.9982771859297246)}

['468bf9178.jpg', '6090b3a8b.jpg', '1022'] [1] [1.30900000e+03 9.97640283e-01]
{'1022': (
    1.0, 
    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 
    [721, 1259, 1278, 1309, 753, 358], 
    0.9976402831644123)}

['0ef6cd331.jpg', 'e6a6f80cd.jpg', '1022'] [0] [1.72270000e+04 9.98394555e-01]
{'1022': (
    0.9866830065359478, 
    [0.9965686274509804, 1.0, 0.9529411764705882, 1.0, 1.0, 0.9705882352941176], 
    [17227, 0, 1561, 0, 0, 16281], 
    0.9983945550575503)}

['d4f0aaa70.jpg', 'd84d4a78a.jpg', '0012'] [0] [5.95230000e+04 9.98578088e-01]
{'0012': (
    0.9937704248366013, 
    [0.96875, 1.0, 1.0, 0.9938725490196079, 1.0, 1.0], 
    [59523, 0, 0, 43375, 0, 0], 
    0.9985780878066167)}

# 4
['de6fb187d.jpg', 'ea6dc23b7.jpg', '1021'] [1] [223.           0.99544613]
{'1021': (
    1.0, 
    [1.0, 1.0, 1.0, 1.0], 
    [0, 223, 0, 2], 
    0.9954461317609192)}

['c3193fb05.jpg', 'cc68e7818.jpg', '0112'] [0] [2.16300000e+04 9.98311792e-01]
{'0112': (
    0.9834865196078432, 
    [1.0, 0.9743872549019608, 1.0, 0.9595588235294118], 
    [0, 21630, 0, 5512], 
    0.9983117921966159)}

['cd3c59923.jpg', 'efdd03319.jpg', '1021'] [0] [6.70246000e+05 9.99894307e-01]
{'1021': (
    0.9917279411764706, 
    [0.9669117647058824, 1.0, 1.0, 1.0], 
    [670246, 0, 0, 0], 
    0.9998943068660762)}

# 3
['2f6c0deaa.jpg' 'e44a4f5b0.jpg' '0222'] [1] [24.          0.99509307]
{'0222': (1.0, [1.0, 1.0, 1.0], [24, 0, 0], 0.9950930748673227)}

['204906e27.jpg' '892a69b4b.jpg' '0002'] [1] [6.31644000e+05 9.97614902e-01]
{'0002': (0.9787173202614379, [0.9685049019607843, 1.0, 0.9676470588235294], [588063, 631644, 588677], 0.9976149023387387)}

['4c56d2f00.jpg' 'dcd94e973.jpg' '2022'] [1] [6.31635000e+05 9.97534103e-01]
{'2022': (0.9787173202614379, [0.9685049019607843, 1.0, 0.9676470588235294], [588032, 631635, 588642], 0.9975341029628989)}

['b645cd49b.jpg' 'f2e554691.jpg' '2022'] [1] [3.76847000e+05 9.96659721e-01]
{'2022': (0.986233660130719, [1.0, 0.9872549019607844, 0.9714460784313725], [376847, 328985, 314896], 0.996659721397496)}

['b998c7415.jpg' 'd4d26f700.jpg' '2022'] [1] [3.76847000e+05 9.96680501e-01]
{'2022': (0.986233660130719, [1.0, 0.9872549019607844, 0.9714460784313725], [376847, 328986, 314908], 0.9966805008347782)}

['0ef6cd331.jpg' '3a9e579aa.jpg' '2022'] [0] [1.62810000e+04 9.98394555e-01]
{'2022': (0.9901960784313726, [1.0, 1.0, 0.9705882352941176], [0, 0, 16281], 0.9983945550575503)}

['0ef6cd331.jpg' '813c8ec35.jpg' '0222'] [0] [1.79442000e+05 9.98195859e-01]
{'0222': (0.9838643790849673, [1.0, 1.0, 0.9515931372549019], [0, 0, 179442], 0.9981958590284148)}

['813c8ec35.jpg' 'caa94ffc3.jpg' '0020'] [0] [1.76759000e+05 9.99834742e-01]
{'0020': (0.9944035947712418, [1.0, 1.0, 0.9832107843137254], [0, 0, 176759], 0.9998347424252452)}

['0256ef90d.jpg' '46da51931.jpg' '0020'] [0] [3.70260000e+05 9.99319673e-01]
{'0020': (0.9838643790849673, [1.0, 0.986764705882353, 0.964828431372549], [0, 135489, 370260], 0.9993196729978772)}

['a61b3e245.jpg' 'd84d4a78a.jpg' '2022'] [0] [2.59134100e+06 9.99175738e-01]
{'2022': (0.985702614379085, [0.9584558823529412, 0.9986519607843137, 1.0], [2591341, 38980, 0], 0.9991757382992734)}

# 2


## Check the performance of DupNet

In [None]:
# Load the trained network for inference on the GPU when one is available, else on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = load_checkpoint('out/dup_model.last.pth')
# Move to the selected device; an unconditional model.cuda() fails on CPU-only machines.
model.to(device)
# This section only scores image pairs, so layers such as dropout / batch norm must be in
# inference mode (a no-op if load_checkpoint already did this).
model.eval()
print(model)

In [None]:
def plot_image_pair(img1_id, img2_id, img1_overlay_tag):
    """Print the DupNet scores for an image pair and plot both images side by side.

    The overlapping region selected by ``img1_overlay_tag`` (and its partner
    tag from ``overlay_tag_pairs`` for the second image) is replaced by the
    image minus the median colour of the two overlapping regions, while the
    rest of each image is the result of ``channel_shift('L', ...)``.

    Args:
        img1_id: File name of the first image inside ``train_image_dir``.
        img2_id: File name of the second image inside ``train_image_dir``.
        img1_overlay_tag: Key into ``overlay_tag_slices`` describing which
            part of the first image overlaps the second.
    """
    first = ImgMod(os.path.join(train_image_dir, img1_id))
    second = ImgMod(os.path.join(train_image_dir, img2_id))
    print(gen_dcnn_scores(first.parent_rgb, second.parent_rgb, img1_overlay_tag, model))

    # Axis ticks every 256 pixels, up to and including the image edge.
    tick_step = 256
    tick_count = first.shape[1] // tick_step + 1
    ticks = [k * tick_step for k in range(tick_count)]

    region1 = overlay_tag_slices[img1_overlay_tag]
    region2 = overlay_tag_slices[overlay_tag_pairs[img1_overlay_tag]]

    # Per-channel median over both overlapping regions stacked together.
    stacked = np.vstack([first.parent_rgb[region1], second.parent_rgb[region2]])
    m12 = np.median(stacked, axis=(0, 1), keepdims=True).astype(np.uint8)

    # Shift the 'L' channel down when the median colour is bright, up otherwise.
    if np.sum(m12) >= 384:
        brightness_level = -100
    else:
        brightness_level = 100
    canvas1 = first.channel_shift('L', brightness_level)
    canvas2 = second.channel_shift('L', brightness_level)

    # NOTE(review): if parent_rgb is uint8 this subtraction wraps around
    # rather than clipping — presumably intentional for contrast; confirm.
    dropped1 = first.parent_rgb - m12
    dropped2 = second.parent_rgb - m12

    canvas1[region1] = dropped1[region1]
    canvas2[region2] = dropped2[region2]

    fig, axes = plt.subplots(1, 2, figsize=(15, 8))
    for ax, title, canvas in zip(axes, (img1_id, img2_id), (canvas1, canvas2)):
        ax.imshow(canvas)
        ax.set_title(f'{title}')
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)

In [None]:
# Spot-check a single pair with overlay tag '1021'.
plot_image_pair(
    img1_id='cd3c59923.jpg',
    img2_id='efdd03319.jpg',
    img1_overlay_tag='1021',
)

In [None]:
# Keep only the ground-truth entries whose label is falsy (i.e. not flagged
# as duplicates) and show how many remain.
dup_truth1 = {
    pair_key: label
    for pair_key, label in dup_truth.items()
    if not label
}
len(dup_truth1)

In [None]:
# Score every pair in dup_truth1 and print the ones where DupNet's thresholded
# prediction disagrees with the ground-truth label.
# dup_dict maps (img1_id, img2_id, img1_overlay_tag)
#   -> (is_dup, overlay_score, min(tile_scores)).
ii = 1  # incremented before first use, so printed row numbers start at 2
dup_dict = {}
for (img1_id, img2_id, img1_overlay_tag), is_dup in dup_truth1.items():
    ii += 1

    # Canonicalise the pair so img1_id sorts first. When the images swap
    # roles the overlay tag is replaced by its partner from overlay_tag_pairs.
    if img1_id > img2_id:
        img1_id, img2_id = img2_id, img1_id
        img1_overlay_tag = overlay_tag_pairs[img1_overlay_tag]

    # The same pair may appear in both orientations; score it only once.
    if (img1_id, img2_id, img1_overlay_tag) in dup_dict:
        continue

    # NOTE(review): cv2.imread returns BGR channel order, whereas
    # plot_image_pair feeds `parent_rgb` (presumably RGB) to gen_dcnn_scores.
    # Confirm which channel order the scorers expect.
    img1 = cv2.imread(os.path.join(train_image_dir, img1_id))
    img2 = cv2.imread(os.path.join(train_image_dir, img2_id))

    entropy_score = get_entropy_score(img1_id, img2_id, img1_overlay_tag, sdcic.tile_entropy_grids)
    overlay_score = get_overlay_score(img1_id, img2_id, img1_overlay_tag, sdcic.tile_bm0hash_grids)
    tile_scores = get_tile_scores(img1_id, img2_id, img1_overlay_tag, sdcic.tile_bm0hash_grids)
    pixel_scores = gen_pixel_scores(img1, img2, img1_overlay_tag)
    dcnn_scores_raw = gen_dcnn_scores(img1, img2, img1_overlay_tag, model)
    # Threshold the raw network outputs at 0.5 to obtain 0/1 predictions.
    dcnn_scores = (dcnn_scores_raw > 0.5) * 1

    dup_dict[(img1_id, img2_id, img1_overlay_tag)] = (is_dup, overlay_score, min(tile_scores))

    # Label 0 and every prediction 0: the network agrees, nothing to report.
    if is_dup == np.max(dcnn_scores) == 0:
        continue

    # Label 1 and every prediction 1: the network agrees, nothing to report.
    if is_dup == np.min(dcnn_scores) == 1:
        continue

    # Report the disagreement. The original printed `image_score`, a name that
    # is not defined in this cell (it only leaked in from a later cell's loop);
    # the overlay score computed above is the value for this pair.
    print(f"{ii:>3} {img1_id} {img2_id} {img1_overlay_tag} {is_dup} {entropy_score} {overlay_score}")
    print(pixel_scores)
    print(dcnn_scores_raw)

In [None]:
# Save a side-by-side figure for every scored pair under temp/.
# savefig does not create missing directories, so make sure temp/ exists.
os.makedirs('temp', exist_ok=True)

for (img1_id, img2_id, img1_overlay_tag), (is_dup, image_score, tiles_score) in dup_dict.items():

    # NOTE(review): the file name omits the overlay tag, so a pair that occurs
    # with several overlay tags is only rendered once — confirm this is intended.
    filename = os.path.join('temp', f"{img1_id}_{img2_id}")
    if os.path.exists(filename):
        continue

    # OpenCV loads BGR; convert to RGB so matplotlib shows the right colours.
    img1 = cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, img1_id)), cv2.COLOR_BGR2RGB)
    img2 = cv2.cvtColor(cv2.imread(os.path.join(train_image_dir, img2_id)), cv2.COLOR_BGR2RGB)

    overlay_score = get_overlay_score(img1_id, img2_id, img1_overlay_tag, sdcic.tile_bm0hash_grids)
    tile_scores = get_tile_scores(img1_id, img2_id, img1_overlay_tag, sdcic.tile_bm0hash_grids)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))

    # Left title carries the overlay score, right title the worst tile score.
    ax1.imshow(img1)
    ax1.set_title(f'{img1_id} ({overlay_score:7.5f})')

    ax2.imshow(img2)
    ax2.set_title(f'{img2_id} ({min(tile_scores):7.5f})')

    fig.savefig(filename)
    # Close the figure instead of merely clearing it: fig.clear() leaves the
    # figure registered with pyplot, so one open figure piles up per pair.
    plt.close(fig)

In [None]:
def plot_image_pair_with_mask(img1_id, img2_id, img1_overlay_tag):
    """Display two training images side by side with ticks every 256 pixels.

    Args:
        img1_id: File name of the left image inside ``train_image_dir``.
        img2_id: File name of the right image inside ``train_image_dir``.
        img1_overlay_tag: Unused by the current body.

    Note:
        Despite the name, no mask is drawn at present; the overlay step is
        commented out inside the plotting loop.
    """
    image_ids = (img1_id, img2_id)
    imgmods = [ImgMod(os.path.join(train_image_dir, image_id)) for image_id in image_ids]

    # Axis ticks every 256 pixels, sized from the first image.
    tick_step = 256
    tick_count = imgmods[0].shape[1] // tick_step + 1
    ticks = [k * tick_step for k in range(tick_count)]

    images = [imgmod.parent_rgb for imgmod in imgmods]

    fig, axes = plt.subplots(1, 2, figsize=(15, 8))
    for ax, image_id, image in zip(axes, image_ids, images):
        ax.imshow(image)
        # Mask overlay is currently disabled:
        # if os.path.exists(os.path.join(train_mask_dir, image_id)):
        #     mask = cv2.imread(os.path.join(train_mask_dir, image_id))
        #     mask = cv2.cvtColor(mask, cv2.COLOR_BGR2RGB)
        #     ax.imshow(mask[..., 0], alpha=0.5)
        ax.set_title(f'{image_id}')
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)

    # Saving to disk is currently disabled:
    # fig.savefig(os.path.join('temp', f"{img1_id}_{img2_id}_mask.jpg"));
    # fig.clear();

In [None]:
# Plot every scored pair; the stored scores are unpacked but not used here.
for pair_key, pair_scores in dup_dict.items():
    img1_id, img2_id, img1_overlay_tag = pair_key
    is_dup, tiles_score, tiles_score1 = pair_scores
    plot_image_pair_with_mask(img1_id, img2_id, img1_overlay_tag)

In [None]:
# Walk the pairs in ascending order of the third stored value and, for each
# pair whose label is truthy, print whether a file exists in train_mask_dir
# for each of the two images (1 = exists, 0 = missing).
for key, value in sorted(dup_dict.items(), key=lambda entry: entry[1][2]):
    has_mask = []
    if not value[0]:
        continue
    has_mask = [
        int(os.path.exists(os.path.join(train_mask_dir, image_id)))
        for image_id in key[:2]
    ]
    print(f'{key[0]}, {has_mask[0]}, {key[1]}, {has_mask[1]}')