In [2]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm_notebook

from sdcdup.utils import load_duplicate_truth
from sdcdup.features.image_features import load_image_overlap_properties

%reload_ext autoreload
%autoreload 2

# SENDTOENV
train_image_dir = 'data/raw/train_768/'

In [13]:
dup_truth = load_duplicate_truth()
print(len(dup_truth))

128950


In [11]:
# score_types = ['bmh', 'cmh', 'con', 'hom', 'eng', 'cor', 'epy', 'enp', 'pix', 'px0', 'shp']
n_matching_tiles_list = [9, 6, 4, 3]
overlap_image_maps = load_image_overlap_properties(n_matching_tiles_list)
print(len(overlap_image_maps))

100%|██████████| 259/259 [00:00<00:00, 60104.28it/s]
100%|██████████| 82823/82823 [00:01<00:00, 62487.19it/s]
100%|██████████| 72629/72629 [00:01<00:00, 46587.05it/s]
100%|██████████| 75936/75936 [00:01<00:00, 60209.20it/s]


227355


## Here we explore dup detection using image gradients and cross entropy 

In [4]:
def get_channel_entropy(ctr, img_size=1769472):  # 768x768x3
    ctr_norm = {k: v / img_size for k, v in sorted(ctr.items())}
    ctr_entropy = {k: -v * np.log(v) for k, v in ctr_norm.items()}
    entropy = np.sum([k * v for k, v in ctr_entropy.items()])
    return entropy

def get_entropy(img_id):
    img = cv2.imread(os.path.join(train_image_dir, img_id))
    img_grad = np.gradient(img.astype(np.int), axis=(0, 1))
    entropy_list = []
    for channel_grad in img_grad:
        ctr = Counter(np.abs(channel_grad).flatten())
        entropy_list.append(get_channel_entropy(ctr, img.size))
    return np.array(entropy_list)

def get_entropy1(img_id):
    img = cv2.imread(os.path.join(train_image_dir, img_id))
    img_grad = np.gradient(img.astype(np.int), 0.5, axis=(0, 1))
    entropy_list = []
    for channel_grad in img_grad:
        ctr = Counter(np.abs(channel_grad).astype(np.uint8).flatten())
        entropy_list.append(ctr)
    return entropy_list

def get_entropy2(img1_id, img2_id):
    entropy1_list = get_entropy1(img1_id)
    entropy2_list = get_entropy1(img2_id)
    entropy_list = []
    for ctr1, ctr2 in zip(entropy1_list, entropy2_list):
        ctr = (ctr1 - ctr2) + (ctr2 - ctr1)
        entropy_list.append(get_channel_entropy(ctr))
    return np.array(entropy_list)

In [14]:
score_lim0 = 0
score_lim1 = 1
for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
    if img1_id > img2_id:
        # sanity check
        raise ValueError(f'img1_id ({img1_id}) should be lexicographically smaller than img2_id ({img2_id})')
    for img1_overlap_tag, scores in overlap_maps.items():
        if (img1_id, img2_id, img1_overlap_tag) not in dup_truth:
            continue
        
        is_dup = dup_truth[(img1_id, img2_id, img1_overlap_tag)]

        if is_dup == 0 and np.max(scores.enp) > score_lim0:
            score_lim0 = np.max(scores.enp)
            print_score = True
        elif is_dup == 1 and np.max(scores.enp) < score_lim1:
            score_lim1 = np.max(scores.enp)
            print_score = True
        else:
            print_score = False

        if print_score:
            img1_entropy_vec = get_entropy(img1_id)
            img2_entropy_vec = get_entropy(img2_id)
            img1_entropy_vec_norm = np.linalg.norm(img1_entropy_vec)
            img2_entropy_vec_norm = np.linalg.norm(img2_entropy_vec)
            n_vec = np.max([img1_entropy_vec_norm, img2_entropy_vec_norm])
            img1_scaled_vec = img1_entropy_vec / n_vec
            img2_scaled_vec = img2_entropy_vec / n_vec
            grad_score = 1.0 - np.linalg.norm(img1_scaled_vec - img2_scaled_vec)

            entropy2 = get_entropy2(img1_id, img2_id)
            entropy2_norm = np.linalg.norm(entropy2)
            
            print('')
            print(f'{is_dup}, {min(scores.bmh):7.5f}, {min(scores.cmh):7.5f}, {grad_score:7.5f}, {entropy2_norm}')
            print(img1_id, img1_entropy_vec, f'{img1_entropy_vec_norm}')
            print(img2_id, img2_entropy_vec, f'{img2_entropy_vec_norm}')
            print(get_entropy(img1_id))
            print(get_entropy(img2_id))
            print(entropy2)
            print(np.max(scores.enp))


HBox(children=(IntProgress(value=0, max=227355), HTML(value='')))


1, 0.99219, 0.99954, 0.99147, 3.407160624749316
00021ddc3.jpg [31.25587833 33.27560909] 45.65299650941682
7ca331f03.jpg [31.52266151 33.56411072] 46.04593051087031
[31.25587833 33.27560909]
[31.52266151 33.56411072]
[2.4595837  2.35779379]
0.990201267411494

1, 0.97656, 0.99974, 0.90180, 39.46644093399122
00ce2c1c0.jpg [70.2441663  88.34734123] 112.86937405938849
8e170847a.jpg [63.40054861 79.62931949] 101.78633546451947
[70.2441663  88.34734123]
[63.40054861 79.62931949]
[24.54014226 30.90924421]
0.8316051346208106

1, 0.96875, 0.99977, 0.90262, 42.41073107475384
0325c0ae2.jpg [76.62711606 93.65979495] 121.01186762490204
c299c4851.jpg [69.19673716 84.51296692] 109.22742334957546
[76.62711606 93.65979495]
[69.19673716 84.51296692]
[26.68430839 32.96388624]
0.8106966444081986

0, 0.98047, 0.93372, 0.85611, 0.007134785639817069
0ef6cd331.jpg [0.00565908 0.00508091] 0.007605309705280226
2095da0cb.jpg [0.00459349 0.00533009] 0.007036329685245373
[0.00565908 0.00508091]
[0.00459349 0.00533

KeyboardInterrupt: 

In [7]:
img1_id = '691d5afc2.jpg'
img2_id = '56417e7af.jpg'

In [8]:
img1_entropy_vec = get_entropy(img1_id)
img2_entropy_vec = get_entropy(img2_id)
img1_entropy_vec_norm = np.linalg.norm(img1_entropy_vec)
img2_entropy_vec_norm = np.linalg.norm(img2_entropy_vec)
n_vec = np.max([img1_entropy_vec_norm, img1_entropy_vec_norm])
img1_scaled_vec = img1_entropy_vec / n_vec
img2_scaled_vec = img2_entropy_vec / n_vec
print('')
print(img1_id, img1_entropy_vec, f'{img1_entropy_vec_norm}')
print(img2_id, img2_entropy_vec, f'{img1_entropy_vec_norm}')
print(f'{np.linalg.norm(img1_scaled_vec - img2_scaled_vec)}')


691d5afc2.jpg [3.94034022 4.35665835] 5.874244885888801
56417e7af.jpg [13.06340313 17.32728654] 5.874244885888801
2.699534368645378


In [5]:
df = pd.read_csv('data/processed/dup_blacklist_6.csv', sep=', ')
for idx, row in df.iterrows():
    print(idx)
    img1_entropy_vec = get_entropy(row['ImageId1'])
    img1_entropy_vec_u = img1_entropy_vec / np.linalg.norm(img1_entropy_vec)
    print(row['ImageId1'], img1_entropy_vec)
    img2_entropy_vec = get_entropy(row['ImageId2'])
    img2_entropy_vec_u = img2_entropy_vec / np.linalg.norm(img2_entropy_vec)
    print(row['ImageId2'], img2_entropy_vec)
    print(np.dot(img1_entropy_vec_u, img2_entropy_vec_u), np.linalg.norm(img1_entropy_vec - img2_entropy_vec))

  """Entry point for launching an IPython kernel.


0
00021ddc3.jpg [31.25587833 33.27560909]
7ca331f03.jpg [31.52266151 33.56411072]
0.999999997782324 0.39294586549146265
1
0325c0ae2.jpg [76.62711606 93.65979495]
c299c4851.jpg [69.19673716 84.51296692]
0.9999999293861828 11.784523477805706
2
03a5fd8d2.jpg [67.90503771 76.395065  ]
676f4cfd0.jpg [61.4407624 69.0712135]
0.9999999318781267 9.768605640571934
3
053af774f.jpg [67.81021288 83.66730719]
8175e0b3d.jpg [61.54553494 75.80713158]
0.9999996457599276 10.051295945776511
4
0714173fd.jpg [64.08851073 79.41514541]
44ebbee2b.jpg [70.57990889 87.61612316]
0.9999996151357298 10.459172344294211
5
0a814feb5.jpg [62.02775408 68.34858778]
9956b7091.jpg [55.94837654 61.63384584]
0.9999999918067333 9.057957308411954
6
0dfd42c61.jpg [21.88167046 18.73323455]
9e1e72979.jpg [21.87103774 18.73022473]
0.9999999870814895 0.011050499667936907
7
10a6d5405.jpg [72.46752317 88.95808625]
a8ec4d506.jpg [65.53761716 80.36819497]
0.9999998721434803 11.036749042603036
8
110d6d7f0.jpg [59.2789396  73.61766418]


KeyboardInterrupt: 