## Duplicate image detection

https://www.kaggle.com/iezepov/get-hash-from-images-slightly-daster

https://blog.iconfinder.com/detecting-duplicate-images-using-python-cb240b05a3b6

https://www.pyimagesearch.com/2017/11/27/image-hashing-opencv-python/

In [1]:
# %matplotlib inline
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local helper modules take
# effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd
import os
import csv
from resources.conv_learner import *

# required to get individual channels out of image object
from PIL import Image, ImageSequence

In [None]:
# make csv from images in folder

def HPAv18_csv_from_dir(path, folder, outfile):
    """Return the unique image names found in ``folder`` under ``path``.

    Each entry listed by ``read_dir(path, folder)`` is reduced to its base
    name, cut at the first ``.``, and then stripped of everything from the
    last ``_`` onwards (presumably the per-channel suffix -- TODO confirm).

    Args:
        path: Root directory passed through to ``read_dir``.
        folder: Sub-folder passed through to ``read_dir``.
        outfile: Accepted for interface compatibility; not used here.

    Returns:
        A pandas Series of the de-duplicated names, keeping the index
        positions of the first occurrences.
    """
    stems = []
    for image_path in read_dir(path, folder):
        base = os.path.basename(image_path).split('.')[0]
        # rsplit (not split): only the LAST underscore-delimited part is dropped
        stems.append(base.rsplit('_', 1)[0])

    return pd.DataFrame(stems)[0].drop_duplicates()

In [None]:
# Dataset root, the image folder to scan, and the name of the CSV to produce.
path = 'datasets/Kaggle_HPA_2018/'
folder = 'HPA_multiproc_test'
outfile = 'HPAv18_labels_test.csv'

# Collect the de-duplicated image names from the folder.
HPAv18_df = HPAv18_csv_from_dir(path, folder, outfile)

In [None]:
# Write the names to path + outfile with neither an index column nor a header row.
HPAv18_df.to_csv(path + outfile, index=False, header=False)

In [None]:
# generate dhashes of images for 'needles' and 'haystack'

In [29]:
from tqdm import tqdm_notebook as tqdm
from HPAv18_ImCompare import dhash

def get_im_paths(path, folders):
    """Collect ``[folder, image_path]`` pairs for every image in ``folders``.

    Args:
        path: Root directory passed through to ``read_dir``.
        folders: Iterable of folder names to list.

    Returns:
        A 2-D numpy array with one row per image: column 0 is the folder
        name, column 1 the entry returned by ``read_dir`` for that folder.
        The total row count is printed as a side effect.
    """
    pairs = []
    for folder_name in folders:
        pairs.extend([folder_name, im_path] for im_path in read_dir(path, folder_name))

    res = np.vstack(pairs)
    print(f"Total images: {len(res)}")
    return res


def get_im_hash_ch0(im_paths, base_path=None):
    """Compute the dhash of the first frame of every image in ``im_paths``.

    Args:
        im_paths: Sized iterable of image paths relative to ``base_path``.
        base_path: Prefix prepended to every entry of ``im_paths``. When
            omitted, the notebook-level ``path`` variable is used, which is
            what this function always did before the parameter existed.

    Returns:
        A list of ``[im_name, hash]`` pairs in input order, where ``hash`` is
        whatever ``dhash`` returns for the first frame of the image.
    """
    if base_path is None:
        base_path = path  # fall back to the notebook-level dataset root

    res = []
    print(f"dhashing images...")

    for im_name in tqdm(im_paths, total=len(im_paths), unit="files"):
        # Context manager closes the underlying file as soon as the hash is
        # computed; otherwise one handle per image stays open.
        with Image.open(base_path + im_name) as im:
            # might want to add options for different channels here
            # frame 0 of the (multi-frame) image -- presumably channel 0
            im_ch_0 = ImageSequence.Iterator(im)[0]
            h = dhash(im_ch_0)

        res.append([im_name, h])

    return res

In [30]:
# define sources
# Dataset root plus the folders holding the query images ("needles") and the
# images they are compared against ("haystack").
path = 'datasets/Kaggle_HPA_2018/'
needles_folders = ['test_needles']
haystack_folders = ['test_haystack']

# get image paths:
needles_path = get_im_paths(path, needles_folders)
haystack_path = get_im_paths(path, haystack_folders)

# Column 1 of each path array is the image path; hash those and stack the
# resulting [path, hash] pairs into a 2-D array.
# hash_list = get_im_hash_ch0(im_paths.Path_Id)
needles = np.vstack(get_im_hash_ch0(needles_path[:,1]))
haystack = np.vstack(get_im_hash_ch0(haystack_path[:,1]))

# pd.DataFrame(hash_list, columns=['Id', 'Ch0_dhash'])

Total images: 35
Total images: 104
dhashing images...


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


dhashing images...


HBox(children=(IntProgress(value=0, max=104), HTML(value='')))




In [5]:
# detect duplicates based on hamming_distance between dhashes

In [31]:
def hamming_distance(s1, s2):
    """Count the positions at which two equal-length sequences differ.

    Raises:
        ValueError: if the two sequences do not have the same length.
    """
    if len(s1) == len(s2):
        mismatches = 0
        for left, right in zip(s1, s2):
            mismatches += (left != right)
        return mismatches
    raise ValueError("Undefined for sequences of unequal length")

def find_dupes(needles, haystack, threshold=10):
    """Find haystack images whose hash is close to the hash of a needle.

    Args:
        needles: Iterable of ``(name, hash)`` pairs to look up.
        haystack: 2-D array with image names in column 0 and their hashes in
            column 1.
        threshold: A haystack entry counts as a duplicate when its Hamming
            distance to the needle hash is strictly below this value.
            Defaults to 10, the cut-off that was previously hard-coded.

    Returns:
        A DataFrame with one row per needle that has at least one match:
        ``Id`` (needle name), ``duplicates`` (array of matching haystack
        names) and ``hamming_dist`` (array of the matching distances).
        The number of such needles is printed as a side effect.
    """
    # Column slices do not depend on the needle, so take them once.
    haystack_names = haystack[:, 0]
    haystack_hashes = haystack[:, 1]

    dupes = []
    for f, n in needles:
        dist = np.array([hamming_distance(n, h) for h in haystack_hashes])
        idx = np.flatnonzero(dist < threshold)
        if len(idx) > 0:  # duplicate found!
            dupes.append([f, haystack_names[idx], dist[idx]])

    print(f'Duplicates found: {len(dupes)}')

    return pd.DataFrame(dupes, columns=['Id','duplicates', 'hamming_dist'])

In [32]:
# Match every needle hash against the haystack and display the result table.
dupe_df = find_dupes(needles, haystack)
dupe_df

Duplicates found: 7


Unnamed: 0,Id,duplicates,hamming_dist
0,test_needles\00ad3e84-bad1-11e8-b2b8-ac1f6b643...,[test_haystack\00ad3e84-bad1-11e8-b2b8-ac1f6b6...,"[0, 0, 0, 2]"
1,test_needles\1183_50_A11_1.tiff,[test_haystack\1183_50_A11_1.tiff],[0]
2,test_needles\1183_50_A11_2.tiff,[test_haystack\1183_50_A11_2.tiff],[0]
3,test_needles\1183_51_A11_1.tiff,[test_haystack\1183_51_A11_1.tiff],[0]
4,test_needles\1183_51_A11_2.tiff,[test_haystack\1183_51_A11_2.tiff],[0]
5,test_needles\1183_52_A11_1.tiff,[test_haystack\1183_52_A11_1.tiff],[0]
6,test_needles\1183_52_A11_2.tiff,[test_haystack\1183_52_A11_2.tiff],[0]


In [None]:
# append purged_df to HPA_labels.csv and save as HPA_extended_data.csv
# Option 1: delete duplicate images in HPAv18 folder; then proceed...
# Option 2: bash copy all original data into HPAv18 (faster than other way around)
# Option 3: leave files in respective folders; modify csv's to include partial path...

In [123]:
def remove_dupes(csv_file, dupes=None, base_path=None):
    """Drop the rows of a label CSV whose Id belongs to a detected duplicate.

    The CSV is expected to have a header row followed by ``Id,Target`` rows.
    Every path listed in ``dupes.duplicates`` is reduced to its base name cut
    at the first ``.`` and rows with a matching ``Id`` are removed.

    Args:
        csv_file: File name of the label CSV, relative to ``base_path``.
        dupes: DataFrame with a ``duplicates`` column holding iterables of
            image paths. Defaults to the notebook-level ``dupe_df``.
        base_path: Prefix prepended to ``csv_file``. Defaults to the
            notebook-level ``path``.

    Returns:
        ``(purged_df, orig_df)``: the label table without the duplicate rows
        and the unmodified table as read from disk. The number of removed
        rows is printed as a side effect.
    """
    if dupes is None:
        dupes = dupe_df  # fall back to the notebook-level result table
    if base_path is None:
        base_path = path  # fall back to the notebook-level dataset root

    with open(base_path + csv_file, 'r', newline='') as d:
        rows = list(csv.reader(d))

    # rows[0] is the header line; the column names are set explicitly instead.
    orig_df = pd.DataFrame(rows[1:], columns=['Id', 'Target'])

    dup_ids = {
        os.path.basename(d).split('.')[0]
        for im in dupes.duplicates
        for d in im
    }
    fnames_df_purge = orig_df[~orig_df.Id.isin(dup_ids)].copy()

    # The original message referenced an undefined name (purged_df).
    print(f'Total images removed: {len(orig_df) - len(fnames_df_purge)}')

    return fnames_df_purge, orig_df

In [124]:
# Label file to purge; remove_dupes returns the purged table and the original.
csv_file = 'HPAv18RBGY_wodpl.csv'
purged_haystack_df, orig_df = remove_dupes(csv_file)

# save purged csv:
# purged_haystack_df.to_csv(path + 'HPAv18_labels_unique.csv')

Total images less: 6


In [None]:
# Option 3: modify the CSVs to include the partial path to each folder, so that merging folders can be avoided.

In [125]:
# load HPA_labels.csv to add partial path to Id's
def load_csv(path_to_csv):
    """Read a two-column label CSV into a DataFrame.

    Args:
        path_to_csv: Full path of the CSV file to read. (Previously this
            argument was ignored and the notebook-level ``path + csv_file``
            was opened instead.)

    Returns:
        A DataFrame with string columns ``Id`` and ``Target`` holding every
        row of the file. No row is treated as a header, so a header line in
        the file ends up as the first data row.
    """
    with open(path_to_csv, 'r', newline='') as d:
        rows = list(csv.reader(d))

    return pd.DataFrame(rows, columns=['Id', 'Target'])


# add partial path to Id's in csv's
def add_partial_path(df, prefix='test/'):
    """Prefix every value of ``df['Id']`` with a partial folder path.

    The column is rewritten in a single vectorised assignment. The previous
    implementation called ``Series.replace`` once per row while iterating the
    same column, which is quadratic and can prefix repeated Ids more than once.

    Args:
        df: DataFrame with an ``Id`` column; modified in place.
        prefix: String prepended to each Id. Defaults to ``'test/'``, the
            value that was previously hard-coded.

    Returns:
        The same ``df`` object, for call chaining.
    """
    df['Id'] = prefix + df['Id'].astype(str)
    return df

In [None]:
# Full paths of the two label files, built from the dataset root.
HPA_labels_file = path + 'HPA_labels.csv'
HPAv18_uniques_labels_file = path + 'HPAv18_labels_unique.csv'

# Pass each path to load_csv and keep the resulting DataFrames.
HPA_labels_df = load_csv(HPA_labels_file)
HPAv18_labels_df = load_csv(HPAv18_uniques_labels_file)

In [None]:
# add_partial_path returns the frame it was given, so each mod_* name refers
# to the same object as the corresponding *_labels_df input.
mod_HPA = add_partial_path(HPA_labels_df)
mod_HPAv18_purged = add_partial_path(HPAv18_labels_df)

In [None]:
# mod_HPA.to_csv(path + 'HPA_Kaggle_labels_with_path')
# mod_HPAv18_purged.to_csv(path + 'HPAv18_labels_with_path.csv')

In [None]:
# Print every duplicate path, one per line, flattening the per-needle groups.
for im in dupe_df.duplicates:
    for d in im:
        print(d)

In [None]:
# THIS DELETES FILES!!!

# Build the on-disk path of every duplicate. The removal command further down
# in this cell is commented out, so as written nothing is deleted.
for im in dupe_df.duplicates:
    for d in im:
        targ = (path + d)
        
#         !rm $targ # ---> activate to arm the function!!!!

#         print(f'deleted duplicate: {os.path.basename(targ)}')


In [None]:
# Scratch check: hash of the first needle and of the first haystack entry.
# Note that `u` is assigned but not used anywhere in this cell.
v = needles[0][1]
u = haystack[0][1]


# Distance from the first needle to every haystack hash, then the positions
# whose distance is below 10.
dist = np.array([hamming_distance(v,h) for h in haystack[:,1]])
idx = np.flatnonzero(dist < 10) # better than np.where() ?!

# Join the matching haystack names into one '-'-separated string and show it.
dada = '-'.join(list(haystack[:,0][idx]))
dada
# list(haystack[idx,0])[0]


In [None]:
# compiling file-list from Kaggle_HPA_train, test and HPAv18_external

def read_files(base_path, csvs):
    """Read several CSV files and stack all of their rows into one array.

    The previous body ignored both parameters, relied on undefined names
    (``folders``, ``path``, ``flist``) and returned only the rows of the
    first file.

    Args:
        base_path: Prefix prepended to every file name (plain string
            concatenation, so it must end with a path separator).
        csvs: Iterable of CSV file names to read, in order.

    Returns:
        A 2-D numpy array of strings with one row per CSV line, in the order
        the files were given.

    Raises:
        ValueError: from ``np.vstack`` if no rows were read at all, or if
            rows do not all have the same number of fields.
    """
    rows = []

    for name in csvs:
        with open(base_path + name, 'r', newline='') as d:
            rows.extend(csv.reader(d))

    return np.vstack(rows)