In [None]:
import os
import time
import json
import pickle
import hashlib
from collections import defaultdict
from collections import Counter
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from skimage.util import montage
import cv2
from cv2 import img_hash

from sdcdup.utils import get_project_root
from sdcdup.utils import overlap_tag_maps
from sdcdup.utils import overlap_tag_pairs
from sdcdup.utils import generate_pair_tag_lookup
from sdcdup.utils import get_hamming_distance
from sdcdup.utils import get_hamming_distance_array
from sdcdup.features import SDCImageContainer
from sdcdup.visualization import get_ticks
from sdcdup.visualization import draw_overlap_bbox

%load_ext dotenv
%dotenv
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Font-size palette shared by every figure in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE, BIGGEST_SIZE = 10, 12, 16, 20

# (rc group, keyword arguments) pairs, applied in the order listed.
for rc_group, rc_kwargs in (
    ('font', {'size': BIGGEST_SIZE}),         # controls default text sizes
    ('axes', {'titlesize': BIGGEST_SIZE}),    # fontsize of the axes title
    ('axes', {'labelsize': BIGGEST_SIZE}),    # fontsize of the x and y labels
    ('xtick', {'labelsize': BIGGER_SIZE}),    # fontsize of the tick labels
    ('ytick', {'labelsize': BIGGER_SIZE}),    # fontsize of the tick labels
    ('legend', {'fontsize': MEDIUM_SIZE}),    # legend fontsize
    ('figure', {'titlesize': BIGGEST_SIZE}),  # fontsize of the figure title
):
    plt.rc(rc_group, **rc_kwargs)


def montage_rgb(x):
    """Build one montage per channel of a 4-D batch and stack them on the last axis.

    `x` is indexed as x[:, :, :, i] for i in range(x.shape[3]), so its last
    axis is the channel axis.
    """
    channel_montages = [montage(x[:, :, :, i]) for i in range(x.shape[3])]
    return np.stack(channel_montages, -1)


# Data locations are resolved from environment variables (loaded by %dotenv).
project_root = get_project_root()
train_image_dir = os.path.join(project_root, os.getenv('RAW_DATA_DIR'), 'train_768')
interim_data_dir = os.path.join(project_root, os.getenv('INTERIM_DATA_DIR'))
pair_tag_lookup = generate_pair_tag_lookup()
ticks = get_ticks()

# Hash whose bytes are all 255, keyed by metric name. Later cells remove this
# key from their hash tables before searching for matches.
matches_white = {
    f'bmh{n_bytes}': tuple(np.full(n_bytes, 255, dtype='uint8'))
    for n_bytes in (32, 96)
}

In [None]:
# Metric and score threshold used by every match search below.
matches_metric = 'bmh32'
matches_threshold = 0.9

sdcic = SDCImageContainer()
sdcic.load_image_metrics(['md5', 'bmh32', 'bmh96'])
# os.listdir returns entries in arbitrary order; sorting makes the insertion
# order of the hash tables built below (and the samples drawn from them)
# reproducible across runs and machines.
img_ids = sorted(os.listdir(train_image_dir))

## Solid-color tiles all have the same blockMeanHash, but each has a different md5 hash.

In [None]:
# Four solid-colour 256x256 tiles: compare their md5 digests with their
# blockMeanHash values (whole tile, then channel by channel).
black_tile = np.zeros((256, 256, 3), dtype=np.uint8)
white_tile = np.full_like(black_tile, 255)
blue_tile = black_tile.copy()
blue_tile[..., 0] = 255
red_tile = black_tile.copy()
red_tile[..., 2] = 255
color_tiles = [black_tile, white_tile, blue_tile, red_tile]

for color_tile in color_tiles:
    print(hashlib.md5(color_tile.tobytes()).hexdigest())
    print(img_hash.blockMeanHash(color_tile, mode=0)[0])
    # ...for each color channel
    channel_hashes = [
        img_hash.blockMeanHash(color_tile[..., channel], mode=0)
        for channel in range(3)
    ]
    print(np.hstack(channel_hashes)[0])

## Find overlapping images with hashlib.md5
Update: The values in two supposedly identical 256x256 crops do not always match exactly (see below).

In [None]:
# Invert the per-image md5 table: md5 value -> list of image ids containing it.
# An image id is appended once per occurrence, so the list may repeat an id.
md5_by_image = sdcic.img_metrics['md5']
md5hash_dict = defaultdict(list)
for img_id in tqdm_notebook(img_ids):
    for tile_md5 in md5_by_image[img_id]:
        md5hash_dict[tile_md5].append(img_id)

In [None]:
# Histogram of group sizes: k -> number of md5 values whose list has k entries.
dup_counts_dict = defaultdict(int)
for dups in md5hash_dict.values():
    dup_counts_dict[len(dups)] += 1

print('n images with k duplicates', '(k, n)', sep='\n')
sorted_bin_sizes = sorted(dup_counts_dict.items())
sorted_bin_sizes

In [None]:
# Pick one md5 group of exactly `batch_size` entries and show its images.
batch_size = 9   # only groups with exactly this many entries are considered
skip = 365       # stop once this many qualifying groups have been passed over
ii = 0           # running count of all groups visited
jj = 0           # running count of qualifying groups visited
dups0 = None     # reset so a stale value from an earlier run is never plotted

for hash_id, dups in md5hash_dict.items():
    ii += 1
    if len(dups) == batch_size:
        # sorted() instead of list(): set iteration order is not reproducible.
        dups0 = sorted(set(dups))
        img_id = dups0[0]
        # Position of this md5 within the first image's md5 array.
        idx = sdcic.img_metrics['md5'][img_id].tolist().index(hash_id)
        print(hash_id, len(dups), ii)
        if jj == min(dup_counts_dict[len(dups)], skip):
            break
        jj += 1

if dups0 is None:
    raise ValueError(f'no md5 group with exactly {batch_size} entries was found')

batch_limit = 9
# Zero-filled rather than np.empty: a group can hold fewer than `batch_limit`
# unique images (ids may repeat in `dups`), and unfilled slots must render as
# black tiles instead of uninitialised memory.
samples_images = np.zeros((batch_limit, 768, 768, 3), dtype=np.float32)
for i, c_img_id in enumerate(dups0[:batch_limit]):
    c_img = cv2.cvtColor(sdcic.get_img(c_img_id), cv2.COLOR_BGR2RGB)
    samples_images[i] = c_img.astype(np.float32) / 255.0

batch_rgb = montage_rgb(samples_images)

fig, ax = plt.subplots(1, 1, figsize = (16, 16))
ax.imshow(batch_rgb, vmin=0, vmax=1)
plt.axis('off')
plt.show()

## Find overlapping images with cv2.blockMeanHash 
(Using only exact first matches)

In [None]:
# TODO: Use filter for all overlaps here?
# img_ids = filter_duplicates(img_ids)

# Hash (as a tuple, so it is hashable) -> set of image ids containing it.
bm0hash_dict = defaultdict(set)
for img_id in tqdm_notebook(img_ids):
    for h in sdcic.img_metrics[matches_metric][img_id]:
        bm0hash_dict[tuple(h)].add(img_id)

# Drop the all-255 hash. The default argument avoids a KeyError when no tile
# in the data set produced that hash.
bm0hash_dict.pop(matches_white[matches_metric], None)

# Keep only hashes shared by more than one image, largest groups first.
sorted_hash_dict = {}
for key, dups in sorted(bm0hash_dict.items(), key=lambda x: len(x[1]), reverse=True):
    if len(dups) > 1:
        sorted_hash_dict[key] = sorted(dups)

In [None]:
# Histogram of group sizes: k -> number of hashes shared by k images.
dup_counts_dict = defaultdict(int)
for dups in bm0hash_dict.values():
    dup_counts_dict[len(dups)] += 1

print('n images with k duplicates', '(k, n)', sep='\n')
sorted_bin_sizes = sorted(dup_counts_dict.items())
sorted_bin_sizes

In [None]:
def generate_matches(sorted_hash_dict, sdcic, matches_metric, matches_threshold):
    """Collect candidate (img1_id, img2_id, overlap_tag) triples from shared hashes.

    Args:
        sorted_hash_dict: mapping of hash (tuple) -> iterable of image ids
            that contain that hash.
        sdcic: container exposing ``img_metrics[matches_metric][img_id]``.
        matches_metric: name of the hash metric to read from ``sdcic``.
        matches_threshold: minimum score (as returned by
            ``get_hamming_distance_array(..., normalize=True, as_score=True)``)
            for a tile to count as matching the group's hash.

    Returns:
        A set of (img1_id, img2_id, overlap_tag) tuples with img1_id < img2_id.
    """
    test_matches = set()
    for hash_id, img_list in tqdm_notebook(sorted_hash_dict.items()):
        hash_row = np.asarray(hash_id)[None, :]

        # Matching tile indexes are computed once per image; previously the
        # list for img2 was rebuilt on every iteration of the pair loop.
        matching_tiles = {}
        for img_id in img_list:
            scores = get_hamming_distance_array(
                sdcic.img_metrics[matches_metric][img_id], hash_row,
                normalize=True, as_score=True)
            matching_tiles[img_id] = [
                idx for idx, bmhd in enumerate(scores) if bmhd >= matches_threshold
            ]

        for img1_id in img_list:
            tiles1 = matching_tiles[img1_id]
            for img2_id in img_list:
                # Emit each unordered pair once, smaller id first.
                if img2_id <= img1_id:
                    continue
                tiles2 = matching_tiles[img2_id]

                for t1 in tiles1:
                    for t2 in tiles2:
                        overlap_tag = pair_tag_lookup.get((t1, t2))
                        # .get returns None for a tile pair absent from the
                        # lookup; skip it so no match carries a None tag.
                        if overlap_tag is not None:
                            test_matches.add((img1_id, img2_id, overlap_tag))

    return test_matches

def generate_matches2(test_matches, sdcic, matches_metric, matches_threshold):
    """Keep only the candidates whose lowest overlap score reaches the threshold.

    Each candidate is unpacked into the scoring function registered under
    ``sdcic.overlap_scores_config[matches_metric]['func']``; candidates whose
    minimum returned score is below ``matches_threshold`` are dropped.
    Returns the surviving candidates as a set of tuples.
    """
    matches = set()
    for candidate in tqdm_notebook(sorted(test_matches)):
        score_func = sdcic.overlap_scores_config[matches_metric]['func']
        if not min(score_func(*candidate)) < matches_threshold:
            matches.add(tuple(candidate))

    return matches

In [None]:
# Stage 1: candidate triples from images that share a hash.
test_matches = generate_matches(
    sorted_hash_dict=sorted_hash_dict,
    sdcic=sdcic,
    matches_metric=matches_metric,
    matches_threshold=matches_threshold,
)
print(len(test_matches))

In [None]:
# Stage 2: keep the candidates whose overlap scores all reach the threshold.
matches = generate_matches2(
    test_matches=test_matches,
    sdcic=sdcic,
    matches_metric=matches_metric,
    matches_threshold=matches_threshold,
)
print(len(matches))

In [None]:
# Persist the confirmed matches, sorted, one row per match.
matches_file = f'matches_{matches_metric}_{matches_threshold}.csv'
full_matches_file = os.path.join(interim_data_dir, matches_file)
sorted_matches = sorted(matches)
df = pd.DataFrame(sorted_matches)
df.to_csv(full_matches_file, index=False)

In [None]:
# Pick one hash group of exactly `batch_size` images and show up to nine of them.
batch_size = 18  # only groups with exactly this many images are considered
skip = 5         # stop once this many qualifying groups have been passed over
ii = 0           # running count of all groups visited
jj = 0           # running count of qualifying groups visited
dups0 = None     # reset so a stale value from an earlier run is never plotted

for hash_id, dups in bm0hash_dict.items():
    ii += 1
    if len(dups) == batch_size:
        # sorted() instead of list(): set iteration order is not reproducible.
        dups0 = sorted(set(dups))
        img_id = dups0[0]
        # Row indexes in the first image's hash array that equal this hash.
        idx = np.where(np.all(sdcic.img_metrics[matches_metric][img_id] == np.asarray(hash_id), axis=1))[0]
        print(hash_id, len(dups), ii)
        if jj == min(dup_counts_dict[len(dups)], skip):
            break
        jj += 1

if dups0 is None:
    raise ValueError(f'no hash group with exactly {batch_size} images was found')

batch_limit = 9
# Zero-filled rather than np.empty so any slot left unfilled renders as a
# black tile instead of uninitialised memory.
samples_images = np.zeros((batch_limit, 768, 768, 3), dtype=np.float32)
for i, c_img_id in enumerate(dups0[:batch_limit]):
    c_img = cv2.cvtColor(sdcic.get_img(c_img_id), cv2.COLOR_BGR2RGB)
    samples_images[i] = c_img.astype(np.float32) / 255.0

batch_rgb = montage_rgb(samples_images)

fig, ax = plt.subplots(1, 1, figsize = (16, 16))
ax.imshow(batch_rgb, vmin=0, vmax=1)
plt.axis('off')
plt.show()

## Find overlapping images with cv2.blockMeanHash 
(Using first matches within some threshold.)

In [None]:
# Pickle cache for the popcount -> hashes table built below.
score_to_hashes_name = f'score_to_hashes_{matches_metric}.pkl'
score_to_hashes_file = os.path.join(interim_data_dir, score_to_hashes_name)
score_to_hashes_file

In [None]:
# Hash (as a tuple, so it is hashable) -> set of image ids containing it.
hash_dict = defaultdict(set)

for img_id in tqdm_notebook(sorted(img_ids)):
    for h in sdcic.img_metrics[matches_metric][img_id]:
        hash_dict[tuple(h)].add(img_id)

# Drop the all-255 hash. The default argument avoids a KeyError when no tile
# in the data set produced that hash.
hash_dict.pop(matches_white[matches_metric], None)

# Keep only hashes shared by more than one image, largest groups first.
sorted_hash_dict = {}
for key, dups in sorted(hash_dict.items(), key=lambda x: len(x[1]), reverse=True):
    if len(dups) > 1:
        sorted_hash_dict[key] = sorted(dups)

In [None]:
# Count the shared hashes by their number of set bits.
bmh_groups = Counter(np.sum(np.unpackbits(key)) for key in sorted_hash_dict)

for n_set_bits in sorted(bmh_groups):
    print(n_set_bits, bmh_groups[n_set_bits])

In [None]:
# Bucket every distinct hash by its number of set bits ("score").
score_to_hashes0 = defaultdict(set)

for img_id in tqdm_notebook(sorted(img_ids)):
    for h in sdcic.img_metrics[matches_metric][img_id]:
        score = np.sum(np.unpackbits(h))
        score_to_hashes0[int(score)].add(tuple(h))

# Drop the bucket keyed by 'len' * 8 (presumably the hash width in bits, i.e.
# the all-ones hash; confirm against img_metrics_config). The default argument
# avoids a KeyError when that bucket does not exist.
score_to_hashes0.pop(sdcic.img_metrics_config[matches_metric]['len']*8, None)

# Convert to sorted lists of plain-int tuples before pickling.
score_to_hashes1 = defaultdict(list)
for score, h_tup_set in tqdm_notebook(sorted(score_to_hashes0.items())):
    for h_tup in sorted(h_tup_set):
        score_to_hashes1[score].append(tuple(map(int, h_tup)))
print(len(score_to_hashes1))

In [None]:
# Cache the score -> hashes table on disk.
with open(score_to_hashes_file, 'wb') as pkl_out:
    pickle.dump(score_to_hashes1, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the cached table. pickle.load can execute arbitrary code, so only
# load a file that this notebook wrote itself.
with open(score_to_hashes_file, 'rb') as pkl_in:
    score_to_hashes1 = pickle.load(pkl_in)

In [None]:
# Turn each stored hash tuple back into a uint8 array, keeping sorted order.
score_to_hashes = defaultdict(list)
for score, h_tup_list in tqdm_notebook(score_to_hashes1.items()):
    hash_arrays = [np.array(h_tup, dtype=np.uint8) for h_tup in sorted(h_tup_list)]
    if hash_arrays:
        score_to_hashes[score].extend(hash_arrays)

In [None]:
# Bucket sizes in increasing score order.
for score in sorted(score_to_hashes):
    print(score, len(score_to_hashes[score]))

In [None]:
def bit_difference(b1, b2):
    """Return the pairwise count of differing elements between rows of b1 and b2.

    Args:
        b1: 2-D integer array of shape (n1, n_bits); callers in this notebook
            pass the output of ``np.unpackbits(..., axis=1)``.
        b2: 2-D integer array of shape (n2, n_bits).

    Returns:
        Integer array of shape (n1, n2) where entry (i, j) is the sum of
        ``b1[i] ^ b2[j]`` over the last axis.
    """
    # The builtin `int` replaces `np.int`, an alias removed in NumPy 1.24;
    # both select the same default integer dtype.
    return np.sum(b1[:, None, :] ^ b2[None, :, :], dtype=int, axis=2)

def get_array_splits(size, max_size):
    """Partition range(size) into contiguous (start, stop) chunks of at most max_size.

    The smallest number of chunks that respects ``max_size`` is used, and the
    remainder is spread over the leading chunks so that no chunk exceeds
    ``max_size``. Previously the final chunk absorbed the whole remainder and
    could be larger than ``max_size``.

    Args:
        size: number of items to partition.
        max_size: largest allowed chunk length; must be positive.

    Returns:
        List of (start, stop) tuples covering [0, size) in order. Empty when
        ``size`` is not positive.

    Raises:
        ValueError: if ``max_size`` is not positive.
    """
    if max_size <= 0:
        raise ValueError('max_size must be a positive integer')
    if size <= 0:
        return []

    n_chunks = (size - 1) // max_size + 1
    base, extra = divmod(size, n_chunks)

    splits = []
    start = 0
    for chunk_idx in range(n_chunks):
        # The first `extra` chunks take one additional item each.
        stop = start + base + (1 if chunk_idx < extra else 0)
        splits.append((start, stop))
        start = stop
    return splits

def parallel_process(b1, b2, max_size):
    """Compute bit_difference(b1, b2) block by block on a thread pool.

    Both inputs are cut along their first axis with ``get_array_splits``;
    every (b1 chunk, b2 chunk) pair is passed to ``bit_difference`` and the
    result is written into the matching block of the output.

    Args:
        b1: 2-D array, rows are compared against the rows of ``b2``.
        b2: 2-D array.
        max_size: chunk-size limit forwarded to ``get_array_splits``.

    Returns:
        Integer array of shape (b1.shape[0], b2.shape[0]).
    """
    b1_splits = get_array_splits(b1.shape[0], max_size)
    b2_splits = get_array_splits(b2.shape[0], max_size)

    # One entry per output block: ((row_start, row_stop), (col_start, col_stop)).
    splits = [(b1_split, b2_split) for b1_split in b1_splits for b2_split in b2_splits]
    b1_chunks = [b1[lo:hi] for (lo, hi), _ in splits]
    b2_chunks = [b2[lo:hi] for _, (lo, hi) in splits]

    # The builtin `int` replaces `np.int`, an alias removed in NumPy 1.24.
    bit_diff = np.zeros((b1.shape[0], b2.shape[0]), dtype=int)
    with ThreadPoolExecutor() as executor:
        # executor.map yields results in submission order, so they line up
        # with `splits`.
        chunk_results = executor.map(bit_difference, b1_chunks, b2_chunks)
        for ((r0, r1), (c0, c1)), bit_diff_chunk in zip(splits, chunk_results):
            bit_diff[r0:r1, c0:c1] = bit_diff_chunk

    return bit_diff

def get_bit_difference(b1, b2, max_size):
    """Dispatch to the chunked or the direct bit-difference computation.

    Inputs whose row counts both fit within ``max_size`` go straight to
    ``bit_difference``; anything larger is handled by ``parallel_process``.
    """
    fits_in_one_block = b1.shape[0] <= max_size and b2.shape[0] <= max_size
    if fits_in_one_block:
        return bit_difference(b1, b2)
    return parallel_process(b1, b2, max_size)

In [None]:
for bmh96
bounds < 2000
max_size = 500
max_offset = 3
       Workers              Time
single       1  598.7040269374847
thread       4  295.7433452606201
thread      12  230.2517523765564
thread      18  228.97077655792236
thread     all  225.48815441131592
process      4  626.4333462715149

max_offset = 3
workers: all
      max_size              Time
thread     500  225.48815441131592
thread     400  199.03486013412476
thread     200  110.82181811332703
thread     100   98.9623429775238
thread      50  113.35412073135376

5000 < bounds < 6000
max_offset = 1
workers: all
      max_size              Time
process    500   63.29715895652771
process    400   62.84799337387085
process    200   61.1794650554657
process    100   69.73269319534302
thread     500   36.114161252975464
thread     200   27.83505415916443
thread     100   30.965204000473022

15000 < bounds < 16000
max_offset = 1
workers: all
      max_size              Time
process    500   34.47118520736694
process    200   33.95744609832764
process    100   37.45988988876343
thread     500   32.89252519607544
thread     200   26.59809684753418
thread     100   27.964319229125977

24780 < bounds
max_offset = 1
workers: all
      max_size              Time
process   1000   88.20187282562256
process    500   82.89880657196045
process    400   82.8500907421112
process    300   81.7402081489563
process    200   79.99603962898254
process    100   91.45985507965088
thread     500   89.29903173446655
thread     300   86.33439707756042
thread     200   69.23344659805298, 67.5252251625061
thread     100   76.26239681243896

for bmh32
40900 < bounds
max_offset = 1
workers: all
      max_size              Time
thread     400   75.88064217567444
thread     350   73.0563383102417
thread     300   75.30764889717102
thread     200   77.82986497879028


In [None]:
# Search radius derived from the score threshold, plus the chunk size used by
# get_bit_difference.
n_hash_bits = sdcic.img_metrics_config[matches_metric]['len']*8
max_offset = int(n_hash_bits*(1.0-matches_threshold))+1
# max_offset = 1
max_size = 350
print(max_offset, max_size)

In [None]:
# For every pair of score buckets (s1, s2) with 1 <= s2 - s1 <= max_offset,
# record the index pairs of hashes whose bit difference is <= max_offset.
# NOTE(review): hashes inside the same bucket (s2 == s1) are never compared
# with each other here - confirm that this is intended.
t00 = time.time()
overlap_tile_candidates = {}
ii = 0
for s1, h1 in tqdm_notebook(sorted(score_to_hashes.items())):

#     if len(h1) < 40900:
#         continue
#     if len(h1) > 400:
#         continue

    # h1 does not change across offsets, so it is unpacked at most once per
    # bucket (lazily, only when a partner bucket exists).
    b1 = None

    for offset in range(max_offset):

        t0 = time.time()
        s2 = s1 + offset + 1

        if s2 not in score_to_hashes:
            continue

        h2 = score_to_hashes[s2]

#         if len(h2) < 41000:
#             continue
#         if len(h2) > 400:
#             continue

        if b1 is None:
            b1 = np.unpackbits(h1, axis=1)
        b2 = np.unpackbits(h2, axis=1)
        res = get_bit_difference(b1, b2, max_size)
        # Rows are (index into h1, index into h2).
        overlap_tile_candidate = np.argwhere(res <= max_offset)
        if len(overlap_tile_candidate) == 0:
            continue

        overlap_tile_candidates[(s1, s2)] = overlap_tile_candidate
        print(f'{ii}, {s1}, {s2}, {offset + 1}, {b1.shape[0]}, {b2.shape[0]}, {len(overlap_tile_candidate)}, {time.time() - t0}')
        ii += 1

print(time.time() - t00)

In [None]:
# Text cache for the candidate index pairs found above.
prematch_name = f'prematch_candidates_{matches_metric}_{matches_threshold}.csv'
overlap_tile_candidates_file = os.path.join(interim_data_dir, prematch_name)
overlap_tile_candidates_file

In [None]:
# One line per bucket pair: the two bucket keys, then the flattened index pairs.
with open(overlap_tile_candidates_file, 'w') as ofs:
    for (idx1, idx2), arr in tqdm_notebook(overlap_tile_candidates.items()):
        if len(arr) > 0:
            fields = [str(value) for value in (idx1, idx2, *arr.flatten())]
            ofs.write(','.join(fields) + '\n')

In [None]:
# Rebuild the candidate table from the text cache: the first two fields are
# the bucket keys, the rest are index pairs stored flat.
overlap_tile_candidates = {}
with open(overlap_tile_candidates_file, 'r') as ifs:
    for line in tqdm_notebook(ifs.readlines()):
        fields = line.strip().split(',')
        idx1_str, idx2_str, arr_str = fields[0], fields[1], fields[2:]
        arr = np.array([np.int64(token) for token in arr_str])
        bucket_pair = (int(idx1_str), int(idx2_str))
        overlap_tile_candidates[bucket_pair] = arr.reshape((-1, 2))

In [None]:
# Expand each candidate hash pair into (img1_id, img2_id, overlap_tag) triples
# for every pair of distinct images that carry the two hashes.
flip_count = 0    # matches stored with the two images swapped
match_count = 0   # pairs of distinct images processed
total_count = 0   # all image pairs visited, including identical ones
test_matches = set()
for (s1, s2), hash_index_pairs in tqdm_notebook(overlap_tile_candidates.items()):
    for hidx1, hidx2 in hash_index_pairs:
        h1 = score_to_hashes[s1][hidx1]
        h2 = score_to_hashes[s2][hidx2]

        # First tile index of h2 in every image that contains it. This does not
        # depend on img1, so it is computed once here instead of once per img1.
        img2_tiles = [
            (img2_id, np.where(np.all(sdcic.img_metrics[matches_metric][img2_id] == h2, axis=1))[0][0])
            for img2_id in hash_dict[tuple(h2)]
        ]

        for img1_id in list(hash_dict[tuple(h1)]):
            # First tile of img1 whose hash equals h1.
            t1 = np.where(np.all(sdcic.img_metrics[matches_metric][img1_id] == h1, axis=1))[0][0]
            for img2_id, t2 in img2_tiles:
                total_count += 1
                if img1_id == img2_id:
                    continue
                elif img2_id < img1_id:
                    # Store with the smaller id first; the tile pair is swapped too.
                    test_matches.add((img2_id, img1_id, pair_tag_lookup.get((t2, t1))))
                    flip_count += 1
                else:
                    test_matches.add((img1_id, img2_id, pair_tag_lookup.get((t1, t2))))
                match_count += 1
                if match_count % 1_000_000 == 0:
                    print(s1, s2, flip_count, len(test_matches), total_count)

print(s1, s2, flip_count, len(test_matches), total_count)

In [None]:
# Keep the candidates whose lowest overlap score reaches the threshold.
matches = set()
for candidate in tqdm_notebook(test_matches):
    img1_id, img2_id, img1_overlap_tag = candidate
    bmh_scores = sdcic.overlap_scores_config[matches_metric]['func'](*candidate)
    if not min(bmh_scores) < matches_threshold:
        matches.add((img1_id, img2_id, img1_overlap_tag))
len(matches)

In [None]:
# Persist the threshold-based matches, sorted, one row per match.
offset_matches_name = f'matches_{matches_metric}_{matches_threshold}_offset.csv'
matches_file = os.path.join(interim_data_dir, offset_matches_name)
sorted_matches = sorted(matches)
df = pd.DataFrame(sorted_matches)
df.to_csv(matches_file, index=False)

In [None]:
# Group the overlap tags by the number of indexes in their tag map.
overlap_tags_by_size = defaultdict(list)
for tag, tile_indexes in overlap_tag_maps.items():
    overlap_tags_by_size[len(tile_indexes)].append(tag)

# Count matches per (number of indexes, tag), largest overlaps first.
overlap_candidates_stats = Counter()
for n_tiles in sorted(overlap_tags_by_size, reverse=True):
    overlap_tags = overlap_tags_by_size[n_tiles]
    print(n_tiles, overlap_tags)
    for match in matches:
        match_tag = match[2]
        if match_tag in overlap_tags:
            overlap_candidates_stats[(n_tiles, match_tag)] += 1

# Report in increasing order of count.
for stats_key, cts in sorted(overlap_candidates_stats.items(), key=lambda item: item[1]):
    print(stats_key, f'{cts:>6}')

In [None]:
# Show one matched image pair side by side with the overlap region outlined.
# img1_id, img2_id, img1_overlap_tag = matches[1804]

# Example matches collected while exploring; the last uncommented entry is used.
match_strs = [
    '0209f50e1.jpg c6b55566f.jpg 07',
    '0b8ce2b47.jpg ae1508781.jpg 18',
    '0d403a5dc.jpg fb91d24aa.jpg 38',
    '0d604c106.jpg 21d7ea9bf.jpg 38',
    '0dab350e9.jpg 8c7b3dbe6.jpg 08',
    '0e0e77e04.jpg ca09e27e2.jpg 18',
    '0e3c1baba.jpg 53331228d.jpg 08',
    '115ca0d9c.jpg 6cb57577b.jpg 08',
    '144b1d485.jpg 193b28b01.jpg 38',
    '1ae8df736.jpg 9a1c53871.jpg 08',
    '1ae8df736.jpg 9a1c53871.jpg 38',
    '1bdba4a27.jpg f90042ce7.jpg 08',
    # '1ec51371b.jpg ee94d427e.jpg 08',
    # '21d7ea9bf.jpg 89d46f4c4.jpg 08',
]
match_str = match_strs[-1]
img1_id, img2_id, img1_overlap_tag = match_str.split()
max_hamm = avg_hamm = None  # only known when the pair comes from t_matches

# t_matches is built by a cell further down the notebook. Use its entry when
# that cell has already been run; otherwise keep the pair from match_str so
# this cell still works on a fresh kernel (Restart & Run All).
try:
    img1_id, img2_id, img1_overlap_tag, max_hamm, avg_hamm = t_matches[25]
except (NameError, IndexError):
    pass
# img1_overlap_tag = '28'

img1 = cv2.cvtColor(sdcic.get_img(img1_id), cv2.COLOR_BGR2RGB)
img2 = cv2.cvtColor(sdcic.get_img(img2_id), cv2.COLOR_BGR2RGB)
GREEN = (76, 175, 80)
bbox_thickness = 8
bbox_color = GREEN
draw_overlap_bbox(img1, img1_overlap_tag, bbox_thickness, bbox_color)
# The second image is outlined with the tag paired to img1's tag.
draw_overlap_bbox(img2, overlap_tag_pairs[img1_overlap_tag], bbox_thickness, bbox_color)

fig, ax = plt.subplots(1, 2, figsize = (16, 16))
ax[0].imshow(img1)
ax[0].set_title(f"{img1_id}  max: {max_hamm}")
ax[0].set_xticks(ticks)
ax[0].set_yticks(ticks)
ax[1].imshow(img2)
ax[1].set_title(f"{img2_id}  avg: {avg_hamm}")
ax[1].set_xticks(ticks)
ax[1].set_yticks(ticks)
plt.show()

In [None]:
# Load the bmh96 / 0.8 matches as a list of string tuples (dtype=str keeps
# every field as text).
matches_file8 = os.path.join(interim_data_dir, 'matches_bmh96_0.8.csv')
df = pd.read_csv(matches_file8, dtype=str)
matches8 = df.to_dict('split')['data']
matches8_list = [tuple(match) for match in matches8]
len(matches8_list)

In [None]:
# For matches in the 0.8 list that are missing from `matches`, print the
# per-tile hamming distances and collect (ids, tag, max, avg) in t_matches.
t_matches = []
for img1_id, img2_id, img1_overlap_tag in sorted(matches8_list):
    if img1_id == '115ca0d9c.jpg':
        print(img1_id, img2_id, img1_overlap_tag, sdcic.get_bmh_scores(img1_id, img2_id, img1_overlap_tag))
    if (img1_id, img2_id, img1_overlap_tag) in matches:
        continue
    # Cheap size filter first, so scores are only computed for overlaps that
    # can still be reported.
    if len(overlap_tag_maps[img1_overlap_tag]) <= 4:
        continue
    scores = sdcic.get_bmh_scores(img1_id, img2_id, img1_overlap_tag)
    if np.max(scores) < 1:
        img1_overlap_map = overlap_tag_maps[img1_overlap_tag]
        img2_overlap_map = overlap_tag_maps[overlap_tag_pairs[img1_overlap_tag]]
        print('')
        hamm_total = 0
        max_hamm = 0
        for idx1, idx2 in zip(img1_overlap_map, img2_overlap_map):
            h1 = sdcic.img_metrics['bmh96'][img1_id][idx1]
            h2 = sdcic.img_metrics['bmh96'][img2_id][idx2]
            b1 = np.unpackbits(h1)
            b2 = np.unpackbits(h2)
            # The builtin `int` replaces `np.int`, an alias removed in NumPy 1.24.
            hamm = np.sum(b1 ^ b2, dtype=int)
            max_hamm = hamm if hamm > max_hamm else max_hamm
            hamm_total += hamm
            # Running total divided by the full tile count; it equals the
            # average only after the last tile has been added.
            avg_hamm = hamm_total // len(img1_overlap_map)
            print(img1_id, img2_id, img1_overlap_tag, idx1, idx2, f"{hamm:>3} {avg_hamm:>3}")
        t_matches.append((img1_id, img2_id, img1_overlap_tag, max_hamm, avg_hamm))
#             break

In [None]:
# Load the bmh96 / 0.9 matches as a list of string tuples (dtype=str keeps
# every field as text).
matches_file9 = os.path.join(interim_data_dir, 'matches_bmh96_0.9.csv')
df = pd.read_csv(matches_file9, dtype=str)
matches9 = df.to_dict('split')['data']
matches9_list = [tuple(match) for match in matches9]
len(matches9_list)

In [None]:
# For matches in the 0.9 list that are missing from `matches`, print the
# per-tile hamming distances together with both tiles' md5 values.
eqsum_cts = 0  # not updated anywhere in this cell
for img1_id, img2_id, img1_overlap_tag in sorted(matches9_list):
    if (img1_id, img2_id, img1_overlap_tag) in matches:
        continue
    # Cheap size filter first, so scores are only computed for overlaps that
    # can still be reported.
    if len(overlap_tag_maps[img1_overlap_tag]) <= 4:
        continue
    scores = sdcic.get_bmh_scores(img1_id, img2_id, img1_overlap_tag)
    if np.max(scores) < 1:
        img1_overlap_map = overlap_tag_maps[img1_overlap_tag]
        img2_overlap_map = overlap_tag_maps[overlap_tag_pairs[img1_overlap_tag]]
        print('')
        hamm_total = 0
        for idx1, idx2 in zip(img1_overlap_map, img2_overlap_map):
            h1 = sdcic.img_metrics['bmh96'][img1_id][idx1]
            h2 = sdcic.img_metrics['bmh96'][img2_id][idx2]
            b1 = np.unpackbits(h1)
            b2 = np.unpackbits(h2)
            m1 = sdcic.img_metrics['md5'][img1_id][idx1]
            m2 = sdcic.img_metrics['md5'][img2_id][idx2]
            # The builtin `int` replaces `np.int`, an alias removed in NumPy 1.24.
            hamm = np.sum(b1 ^ b2, dtype=int)
            hamm_total += hamm
            print(img1_id, img2_id, img1_overlap_tag, idx1, idx2, f"{hamm:>3} {hamm_total//len(img1_overlap_map):>3}", m1, m2)
#             break