In [None]:
from collections import Counter
from collections import defaultdict

import numpy as np
import networkx as nx
from tqdm import tqdm_notebook

from sdcdup.utils import generate_tag_pair_lookup
from sdcdup.utils import load_duplicate_truth
from sdcdup.utils import update_tile_cliques
from sdcdup.features import SDCImageContainer
from sdcdup.features import load_image_overlap_properties

%reload_ext autoreload
%autoreload 2

# Lookup used by the cells below: indexed by an overlap tag, each entry is
# iterated as (idx1, idx2) pairs, which are then used as per-tile indices
# into the first and second image's metrics respectively.
img_overlap_index_maps = generate_tag_pair_lookup()

In [None]:
# Image container shared by every cell below.
sdcic = SDCImageContainer()
# Load the two per-image metrics this notebook reads via sdcic.img_metrics:
# 'md5' (indexed below as [img_id][tile_idx] to get a tile hash) and
# 'sol' (the per-image grid used below to detect solid tiles).
sdcic.load_image_metrics(['md5', 'sol'])

In [None]:
# Labelled truth. The cells below iterate it as
# {(img1_id, img2_id, img1_overlap_tag): is_dup}.
dup_truth = load_duplicate_truth()
print(len(dup_truth))

In [None]:
# NOTE(review): presumably the numbers of matching tiles for which overlap
# properties should be loaded -- confirm against load_image_overlap_properties.
n_matching_tiles_list = [9, 6, 4, 3, 2]
# The next cell iterates this as {(img1_id, img2_id): overlap_maps}, where
# iterating overlap_maps yields overlap tags.
overlap_image_maps = load_image_overlap_properties(n_matching_tiles_list, sdcic)
print(len(overlap_image_maps))

## Get all the overlap_image_maps that are not in dup_truth.

In [None]:
# Collect every (img1_id, img2_id, overlap_tag) triple that has overlap
# properties but no entry in dup_truth yet; these are the triples the rest
# of the notebook tries to label automatically.
overlap_candidates = []
for image_pair, tags in tqdm_notebook(overlap_image_maps.items()):
    unlabelled = [
        (*image_pair, tag)
        for tag in tags
        if (*image_pair, tag) not in dup_truth
    ]
    overlap_candidates.extend(unlabelled)
print(len(overlap_candidates))

## Create the set of solid (flat) tile hashes
(i.e. hashes of tiles in which every pixel is the same color)

In [None]:
# Gather the md5 hash of every tile whose 'sol' entry is entirely >= 0.
# NOTE(review): presumably a negative 'sol' value marks "not solid" -- confirm
# against the code that produces the 'sol' metric.
solid_hashes = set()
for img_id, solid_grid in sdcic.img_metrics['sol'].items():
    is_nonneg = solid_grid >= 0
    # Tiles that have at least one non-negative entry are the only ones that
    # can pass the "all entries non-negative" test below.
    for tile_idx in np.unique(np.nonzero(is_nonneg)[0]):
        if is_nonneg[tile_idx].all():
            solid_hashes.add(sdcic.img_metrics['md5'][img_id][tile_idx])

print(solid_hashes)

### Automatically find dups using dicts

In [None]:
# Build two symmetric lookup tables from the labelled truth:
#   tile_hash_dup_dict[h] -> hashes known to be duplicates of h (h itself included)
#   tile_hash_dif_dict[h] -> hashes known to be different from h
tile_hash_dup_dict = defaultdict(set)
tile_hash_dif_dict = defaultdict(set)

for (img1_id, img2_id, img1_overlap_tag), is_dup in dup_truth.items():

    for idx1, idx2 in img_overlap_index_maps[img1_overlap_tag]:

        tile1_hash = sdcic.img_metrics['md5'][img1_id][idx1]
        tile2_hash = sdcic.img_metrics['md5'][img2_id][idx2]

        if is_dup:
            # Pairs touching a solid tile are never recorded as duplicates.
            # NOTE(review): presumably because two solid tiles matching says
            # nothing about whether the surrounding images overlap -- confirm.
            if tile1_hash in solid_hashes or tile2_hash in solid_hashes:
                continue

            # Every key contains itself, and every member is also a key.
            tile_hash_dup_dict[tile1_hash].update((tile1_hash, tile2_hash))
            tile_hash_dup_dict[tile2_hash].update((tile1_hash, tile2_hash))

        else:
            # Identical hashes cannot be evidence of a difference.
            if tile1_hash == tile2_hash:
                continue

            tile_hash_dif_dict[tile1_hash].add(tile2_hash)
            tile_hash_dif_dict[tile2_hash].add(tile1_hash)

print(len(tile_hash_dup_dict), len(tile_hash_dif_dict))

# The checks below read tile_hash_dif_dict with .get() rather than []:
# indexing a defaultdict inserts an empty set for every key it has not seen,
# which would silently grow tile_hash_dif_dict by one entry per dup hash.

# Sanity check: hashes cannot be simultaneously "a dup" and "not a dup" of tile1_hash
for tile1_hash, tile1_dups in tile_hash_dup_dict.items():
    conflicts = tile1_dups.intersection(tile_hash_dif_dict.get(tile1_hash, ()))
    if conflicts:
        # A conflict implies tile1_hash is already a key, so [] is safe here.
        print(tile1_hash, tile1_dups, tile_hash_dif_dict[tile1_hash])
    assert not conflicts

# Sanity check: If B and C are dups of A, then make sure C not in tile_hash_dif_dict[B]
for tile1_hash, tile1_dups in tile_hash_dup_dict.items():
    members = sorted(tile1_dups)
    for tile1_dup1 in members:
        for tile1_dup2 in members:
            known_difs = tile_hash_dif_dict.get(tile1_dup2, ())
            if tile1_dup1 in known_difs:
                print(tile1_hash, tile1_dup1, known_difs)
            assert tile1_dup1 not in known_difs

# Now we should be able to form cliques: (i.e. If A == B and B == C, then A == C)
# Only existing keys are touched, so the dict does not change size while it is
# being iterated. The set of the hash being processed already holds every
# member, so taking one sorted snapshot per hash is safe.
# NOTE(review): this is a single sweep over the keys -- confirm it reaches the
# full transitive closure for every insertion order (the networkx section
# below can serve as a cross-check).
for tile1_hash, tile1_dups in tile_hash_dup_dict.items():
    members = sorted(tile1_dups)
    for pos, lower_hash in enumerate(members):
        for higher_hash in members[pos + 1:]:
            tile_hash_dup_dict[lower_hash].add(higher_hash)
            tile_hash_dup_dict[higher_hash].add(lower_hash)

# Histogram: size of a hash's duplicate set -> number of hashes with that size.
neighbor_counts = Counter(len(tile1_dups) for tile1_dups in tile_hash_dup_dict.values())
sorted(neighbor_counts.items())

In [None]:
# Label a candidate overlap as "not a duplicate" (0) as soon as one of its
# tile pairs is already known, from the labelled truth, to be different.
auto_overlap_labels_0 = {}

for img1_id, img2_id, img1_overlap_tag in overlap_candidates:
    for idx1, idx2 in img_overlap_index_maps[img1_overlap_tag]:

        tile1_hash = sdcic.img_metrics['md5'][img1_id][idx1]
        tile2_hash = sdcic.img_metrics['md5'][img2_id][idx2]

        # .get() instead of []: indexing the defaultdict would insert an empty
        # set for every candidate hash that never appeared in the truth,
        # growing the lookup table as a side effect of a read.
        if tile1_hash in tile_hash_dif_dict.get(tile2_hash, ()):
            # The table is built symmetrically; verify that here.
            assert tile2_hash in tile_hash_dif_dict.get(tile1_hash, ())
            auto_overlap_labels_0[(img1_id, img2_id, img1_overlap_tag)] = 0
            break

print(len(auto_overlap_labels_0))

In [None]:
# Label a candidate overlap as "duplicate" (1) only when every one of its
# tile pairs is already a known duplicate pair in tile_hash_dup_dict.
auto_overlap_labels_1 = {}

for img1_id, img2_id, img1_overlap_tag in overlap_candidates:
    for idx1, idx2 in img_overlap_index_maps[img1_overlap_tag]:

        tile1_hash = sdcic.img_metrics['md5'][img1_id][idx1]
        tile2_hash = sdcic.img_metrics['md5'][img2_id][idx2]

        # .get() instead of []: indexing the defaultdict would insert an empty
        # set for every candidate hash that never appeared in the truth,
        # growing the lookup table as a side effect of a read.
        if tile1_hash not in tile_hash_dup_dict.get(tile2_hash, ()):
            # One pair that is not a known duplicate rules the candidate out.
            break
        # The table is built symmetrically; verify that here.
        assert tile2_hash in tile_hash_dup_dict.get(tile1_hash, ())
    else:
        # Reached only when the inner loop finished without a break.
        auto_overlap_labels_1[(img1_id, img2_id, img1_overlap_tag)] = 1

print(len(auto_overlap_labels_1))

### Using cliques (networkx)

In [None]:
# Graph version of the duplicate / different lookup tables: nodes are tile
# hashes, edges come from the labelled truth.
tile_hash_dup_cliques = nx.Graph()
tile_hash_dif_cliques = nx.Graph()

for truth_key, is_dup in dup_truth.items():
    img1_id, img2_id, img1_overlap_tag = truth_key
    for idx1, idx2 in img_overlap_index_maps[img1_overlap_tag]:
        tile1_hash = sdcic.img_metrics['md5'][img1_id][idx1]
        tile2_hash = sdcic.img_metrics['md5'][img2_id][idx2]
        if not is_dup:
            # Identical hashes cannot be evidence of a difference.
            if tile1_hash != tile2_hash:
                tile_hash_dif_cliques.add_edge(tile1_hash, tile2_hash)
        elif tile1_hash not in solid_hashes and tile2_hash not in solid_hashes:
            # Duplicate pairs touching a solid tile are left out of the graph.
            update_tile_cliques(tile_hash_dup_cliques, tile1_hash, tile2_hash)

print(tile_hash_dup_cliques.number_of_nodes(), tile_hash_dif_cliques.number_of_nodes())

# Histogram: connected-component size -> number of components of that size.
neighbor_counts = Counter(
    len(component) for component in nx.connected_components(tile_hash_dup_cliques)
)
sorted(neighbor_counts.items())

#### Separately

In [None]:
# Label a candidate overlap as "not a duplicate" (0) as soon as one of its
# tile pairs is joined by an edge in the "known different" graph.
auto_overlap_labels_0 = {}

for img1_id, img2_id, img1_overlap_tag in overlap_candidates:
    for idx1, idx2 in img_overlap_index_maps[img1_overlap_tag]:

        tile1_hash = sdcic.img_metrics['md5'][img1_id][idx1]
        tile2_hash = sdcic.img_metrics['md5'][img2_id][idx2]

        # has_edge() returns False when either hash is not a node, so it covers
        # the membership test too, without building a set of neighbours for
        # every tile pair.
        if tile_hash_dif_cliques.has_edge(tile1_hash, tile2_hash):
            auto_overlap_labels_0[(img1_id, img2_id, img1_overlap_tag)] = 0
            break

print(len(auto_overlap_labels_0))

In [None]:
# Label a candidate overlap as "duplicate" (1) only when every one of its
# tile pairs is joined by an edge in the "known duplicate" graph.
auto_overlap_labels_1 = {}

for img1_id, img2_id, img1_overlap_tag in overlap_candidates:
    for idx1, idx2 in img_overlap_index_maps[img1_overlap_tag]:

        tile1_hash = sdcic.img_metrics['md5'][img1_id][idx1]
        tile2_hash = sdcic.img_metrics['md5'][img2_id][idx2]

        # has_edge() returns False when either hash is not a node, so it covers
        # the membership test too, without building a set of neighbours for
        # every tile pair.
        if not tile_hash_dup_cliques.has_edge(tile1_hash, tile2_hash):
            # One pair that is not a known duplicate rules the candidate out.
            break
    else:
        # Reached only when the inner loop finished without a break.
        auto_overlap_labels_1[(img1_id, img2_id, img1_overlap_tag)] = 1

print(len(auto_overlap_labels_1))

In [None]:
# Merge the two label sets. A candidate must never be labelled both 0 and 1.
auto_overlap_labels = {}
assert not (auto_overlap_labels_0.keys() & auto_overlap_labels_1.keys())
for partial_labels in (auto_overlap_labels_0, auto_overlap_labels_1):
    auto_overlap_labels.update(partial_labels)
print(len(auto_overlap_labels))

#### Simultaneously

In [None]:
# Single-pass labelling of every candidate overlap:
#   0  -> at least one tile pair is a known "different" pair
#   1  -> every tile pair is a known "duplicate" pair
#   -1 -> otherwise (undecided; the candidate is not stored)
auto_overlap_labels = {}

for img1_id, img2_id, img1_overlap_tag in overlap_candidates:
    if (img1_id, img2_id, img1_overlap_tag) in auto_overlap_labels:
        continue
    is_dup = 1
    for idx1, idx2 in img_overlap_index_maps[img1_overlap_tag]:

        tile1_hash = sdcic.img_metrics['md5'][img1_id][idx1]
        tile2_hash = sdcic.img_metrics['md5'][img2_id][idx2]

        # has_edge() returns False when either hash is not a node, so it covers
        # the membership test too, without building a set of neighbours for
        # every tile pair.
        if tile_hash_dif_cliques.has_edge(tile1_hash, tile2_hash):
            # One known difference decides the candidate; stop scanning.
            is_dup = 0
            break
        elif tile_hash_dup_cliques.has_edge(tile1_hash, tile2_hash):
            continue
        else:
            # Unknown pair: no longer a certain duplicate. Keep scanning,
            # because a later pair may still be a known difference.
            is_dup = -1

    if is_dup == -1:
        continue

    auto_overlap_labels[(img1_id, img2_id, img1_overlap_tag)] = is_dup

print(len(auto_overlap_labels))