In [None]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm_notebook

from sdcdup.utils import get_project_root
from sdcdup.utils import load_duplicate_truth
from sdcdup.features import load_image_overlap_properties

%load_ext dotenv
%dotenv
%reload_ext autoreload
%autoreload 2

# Resolve the data directories relative to the project root. The directory
# names come from environment variables (loaded from .env by %dotenv above).
project_root = get_project_root()
# Index os.environ directly so that a missing variable fails with a KeyError
# naming it, instead of os.path.join raising an opaque TypeError on None.
train_image_dir = os.path.join(project_root, os.environ['RAW_DATA_DIR'], 'train_768')
processed_data_dir = os.path.join(project_root, os.environ['PROCESSED_DATA_DIR'])

In [None]:
# Load the duplicate truth labels and report how many entries they hold.
# Later cells look entries up by (img1_id, img2_id, img1_overlap_tag) keys
# and compare the stored value against 0 and 1.
dup_truth = load_duplicate_truth()
print(len(dup_truth))

In [None]:
# Load the precomputed overlap properties and report how many entries there are.
# The scan loop further down iterates this as
# {(img1_id, img2_id): {overlap_tag: scores}} and reads scores.bmh, scores.cmh
# and scores.enp from it.
# score_types = ['bmh', 'cmh', 'epy', 'enp']
overlap_image_maps = load_image_overlap_properties()
print(len(overlap_image_maps))

## Exploring duplicate detection using image gradients and cross entropy

In [None]:
def get_channel_entropy(ctr, img_size=1769472):  # 768*768*3 = 1769472
    """Collapse a histogram of values into one value-weighted entropy score.

    Every count is turned into a probability ``p = count / img_size``; the
    score is the sum over all keys of ``key * (-p * log(p))``, so a key of 0
    contributes nothing and larger keys weigh more.

    Args:
        ctr: Mapping from a numeric value to its number of occurrences.
        img_size: Normaliser applied to the counts (default 768 * 768 * 3).

    Returns:
        The weighted entropy as a NumPy float (0.0 for an empty mapping).
    """
    weighted_terms = []
    # Visit keys in ascending order so the summation order is deterministic.
    for value, count in sorted(ctr.items()):
        prob = count / img_size
        weighted_terms.append(value * (-prob * np.log(prob)))
    return np.sum(weighted_terms)

def get_entropy(img_id):
    """Return the gradient-weighted entropy of an image along each spatial axis.

    The image is read from ``train_image_dir`` and differentiated along its
    first two axes. For each resulting gradient array, a histogram of the
    absolute gradient values is built and reduced to one score by
    ``get_channel_entropy``, normalised by the total number of image elements.

    Args:
        img_id: File name of the image inside ``train_image_dir``.

    Returns:
        np.ndarray holding one score per differentiated axis (axis 0, axis 1).

    Raises:
        OSError: If OpenCV could not read the image.
    """
    img_path = os.path.join(train_image_dir, img_id)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise OSError(f'cv2.imread could not read image: {img_path}')
    # `np.int` was removed in NumPy 1.24; it was an alias of the builtin `int`.
    img_grad = np.gradient(img.astype(int), axis=(0, 1))
    entropy_list = []
    # np.gradient yields one array per requested axis (not per colour channel).
    for axis_grad in img_grad:
        ctr = Counter(np.abs(axis_grad).flatten())
        entropy_list.append(get_channel_entropy(ctr, img.size))
    return np.array(entropy_list)

def get_entropy1(img_id):
    """Return histograms of absolute image gradients, one per spatial axis.

    The image is read from ``train_image_dir`` and differentiated along its
    first two axes with a sample spacing of 0.5, which makes the central
    differences in the interior equal to the plain integer difference of the
    two neighbouring pixels.

    Args:
        img_id: File name of the image inside ``train_image_dir``.

    Returns:
        List of two ``Counter`` objects (axis 0, axis 1) mapping an absolute
        gradient value to the number of times it occurs.

    Raises:
        OSError: If OpenCV could not read the image.
    """
    img_path = os.path.join(train_image_dir, img_id)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise OSError(f'cv2.imread could not read image: {img_path}')
    # `np.int` was removed in NumPy 1.24; it was an alias of the builtin `int`.
    img_grad = np.gradient(img.astype(int), 0.5, axis=(0, 1))
    entropy_list = []
    for axis_grad in img_grad:
        # With spacing 0.5 the one-sided differences at the image border can
        # reach 2 * 255 = 510, which overflows uint8; uint16 holds every value.
        ctr = Counter(np.abs(axis_grad).astype(np.uint16).flatten())
        entropy_list.append(ctr)
    return entropy_list

def get_entropy2(img1_id, img2_id):
    """Score how much the gradient histograms of two images differ.

    For each pair of histograms returned by ``get_entropy1`` the counts are
    combined into their symmetric difference (the absolute count difference
    per gradient value, zero differences dropped), which is then reduced to a
    single score by ``get_channel_entropy`` using its default normaliser.

    Args:
        img1_id: File name of the first image.
        img2_id: File name of the second image.

    Returns:
        np.ndarray with one score per histogram pair.
    """
    counters_a = get_entropy1(img1_id)
    counters_b = get_entropy1(img2_id)
    # Counter subtraction keeps only positive counts, so adding both one-sided
    # differences yields the absolute difference for every gradient value.
    return np.array([
        get_channel_entropy((ctr_a - ctr_b) + (ctr_b - ctr_a))
        for ctr_a, ctr_b in zip(counters_a, counters_b)
    ])

In [None]:
# Walk over every labelled overlap and track two running extremes of the
# per-overlap maximum `enp` score: the highest value seen for non-duplicates
# (is_dup == 0) and the lowest value seen for duplicates (is_dup == 1).
# Whenever a new extreme is found, print gradient-entropy diagnostics for
# that image pair.
score_lim0 = 0
score_lim1 = 1
for (img1_id, img2_id), overlap_maps in tqdm_notebook(overlap_image_maps.items()):
    if img1_id > img2_id:
        # sanity check
        raise ValueError(f'img1_id ({img1_id}) should be lexicographically smaller than img2_id ({img2_id})')
    for img1_overlap_tag, scores in overlap_maps.items():
        truth_key = (img1_id, img2_id, img1_overlap_tag)
        if truth_key not in dup_truth:
            # Only overlaps with a ground-truth label are considered.
            continue

        is_dup = dup_truth[truth_key]
        # Computed once; it is needed for the comparison, the update and the report.
        max_enp = np.max(scores.enp)

        if is_dup == 0 and max_enp > score_lim0:
            score_lim0 = max_enp
            print_score = True
        elif is_dup == 1 and max_enp < score_lim1:
            score_lim1 = max_enp
            print_score = True
        else:
            print_score = False

        if print_score:
            img1_entropy_vec = get_entropy(img1_id)
            img2_entropy_vec = get_entropy(img2_id)
            img1_entropy_vec_norm = np.linalg.norm(img1_entropy_vec)
            img2_entropy_vec_norm = np.linalg.norm(img2_entropy_vec)
            # Scale both vectors by the larger norm so the longer one has unit length.
            n_vec = np.max([img1_entropy_vec_norm, img2_entropy_vec_norm])
            img1_scaled_vec = img1_entropy_vec / n_vec
            img2_scaled_vec = img2_entropy_vec / n_vec
            grad_score = 1.0 - np.linalg.norm(img1_scaled_vec - img2_scaled_vec)

            entropy2 = get_entropy2(img1_id, img2_id)
            entropy2_norm = np.linalg.norm(entropy2)

            print('')
            print(f'{is_dup}, {min(scores.bmh):7.5f}, {min(scores.cmh):7.5f}, {grad_score:7.5f}, {entropy2_norm}')
            print(img1_id, img1_entropy_vec, f'{img1_entropy_vec_norm}')
            print(img2_id, img2_entropy_vec, f'{img2_entropy_vec_norm}')
            # Reuse the vectors computed above instead of re-reading both
            # images; the printed values are the same.
            print(img1_entropy_vec)
            print(img2_entropy_vec)
            print(entropy2)
            print(max_enp)


In [None]:
# Pick one specific image pair to inspect in isolation in the next cell.
img1_id = '691d5afc2.jpg'
img2_id = '56417e7af.jpg'

In [None]:
# Compare the gradient-entropy vectors of the image pair selected above.
img1_entropy_vec = get_entropy(img1_id)
img2_entropy_vec = get_entropy(img2_id)
img1_entropy_vec_norm = np.linalg.norm(img1_entropy_vec)
img2_entropy_vec_norm = np.linalg.norm(img2_entropy_vec)
# Scale by the larger of the two norms (the original took the max of img1's
# norm with itself, so img2's norm was never considered).
n_vec = np.max([img1_entropy_vec_norm, img2_entropy_vec_norm])
img1_scaled_vec = img1_entropy_vec / n_vec
img2_scaled_vec = img2_entropy_vec / n_vec
print('')
print(img1_id, img1_entropy_vec, f'{img1_entropy_vec_norm}')
# Report img2's own norm next to img2's vector.
print(img2_id, img2_entropy_vec, f'{img2_entropy_vec_norm}')
# Distance between the scaled vectors; 0 means identical entropy vectors.
print(f'{np.linalg.norm(img1_scaled_vec - img2_scaled_vec)}')

In [None]:
# Print gradient-entropy comparisons for every image pair listed in the
# blacklist CSV (columns ImageId1 and ImageId2 are read from each row).
# The separator ', ' is longer than one character, so pandas treats it as a
# regular expression, which only the python parser engine supports. Requesting
# that engine explicitly avoids the ParserWarning raised on automatic fallback.
df = pd.read_csv(os.path.join(processed_data_dir, 'dup_blacklist_6.csv'), sep=', ', engine='python')
for idx, row in df.iterrows():
    print(idx)
    img1_entropy_vec = get_entropy(row['ImageId1'])
    # Unit-length copy, used for the cosine similarity printed below.
    img1_entropy_vec_u = img1_entropy_vec / np.linalg.norm(img1_entropy_vec)
    print(row['ImageId1'], img1_entropy_vec)
    img2_entropy_vec = get_entropy(row['ImageId2'])
    img2_entropy_vec_u = img2_entropy_vec / np.linalg.norm(img2_entropy_vec)
    print(row['ImageId2'], img2_entropy_vec)
    # Cosine similarity of the two vectors, then their Euclidean distance.
    print(np.dot(img1_entropy_vec_u, img2_entropy_vec_u), np.linalg.norm(img1_entropy_vec - img2_entropy_vec))