# Function

In [216]:
# Check Duplicate Picture in Directory

# mode = 'same'
# Use Cryptographic hashing algorithms in 'hashlib'
# Editing image won't be count as the duplicated image.

# mode = 'similar'
# Use Perceptual hashing algorithms
# Hash value of editing image will be close to the original image.

import os
import hashlib
from PIL import Image
import imagehash
import numpy as np

class duplicate():
    #################################################################
    # init
    def __init__(self, image_folder_path):
        

        try:
            get_ipython
            self.current_path = os.getcwd() # For test function in .ipynb
        except:
            self.current_path = os.path.dirname(os.path.realpath(__file__)) # For .py
            
        self.current_path = os.path.join(self.current_path, image_folder_path)

        self.remove_filename_list = [] # List of similar image except original one
        self.similar_group_dict = {} # Group of similar image including original one

    #################################################################
    # Find    
    def find(self, mode = 'same', distance = 0, phash_size = 16):
        
        num = 0
        filename_hash = dict()
        image_list = os.listdir(self.current_path)
        
        ###########################

        if mode == 'same':

            hash_keys = dict()
            duplicate_group = dict()
            self.remove_filename_list = []

            for index, filename in enumerate(image_list):

                file_path = os.path.join(self.current_path, filename)

                if os.path.isfile(file_path):
                    with open(file_path, 'rb') as f:
                        filehash = hashlib.md5(f.read()).hexdigest()

                    filename_hash[filename] = filehash

                    if filehash not in hash_keys:
                        hash_keys[filehash] = index
                        
                    else:
                        self.remove_filename_list.append(filename)

            set_hash = set(filename_hash.values())

            for h in set_hash:
                duplicate_group[h] = [k for k in filename_hash.keys() if filename_hash[k] == h]

            for val in duplicate_group.values():
                if len(val) > 1:
                    self.similar_group_dict[num] = val
                    num = num + 1

            ############
            # print

            num_duplicate = len(self.remove_filename_list)
            num_all = len(filename_hash)
            percentage = np.round(num_duplicate/num_all * 100, 2)

            print('There are {} duplicated images from {} images which is around {} %.'.format(num_duplicate, num_all,percentage))

            return self.remove_filename_list, self.similar_group_dict

        ###########################

        if mode == 'similar':
            
            temp_filename_hash = dict()
            temp_filename_list = []
            self.remove_filename_list = []

            print('The accepted distance is {}'.format(distance))
            
            ############
            # Find phash
            for filename in image_list:

                file_path = os.path.join(self.current_path, filename)
                
                if os.path.isfile(file_path):
                    image_file = Image.open(file_path)                        
                    phash = imagehash.phash(image_file, hash_size = phash_size)
                    filename_hash[filename] = phash
                    temp_filename_hash[filename] = phash
            
            ############        
            # Find similarity between image using hamming distance (of phash)
            
            sort_filename_hash = sorted(filename_hash)
            
            for file_first in sort_filename_hash:
                
                if file_first in temp_filename_hash:
                
                    temp_similar_list = []
                    temp_similar_list.append(file_first)
                    temp_filename_list.append(file_first)
                    temp_filename_hash.pop(file_first)

                    image_first = filename_hash[file_first]
                
                for file_second in sort_filename_hash:
                    
                    if file_second not in temp_filename_list:
                        
                        image_second = filename_hash[file_second]
                        
                        hamming_distance = image_first - image_second
                        
                        if hamming_distance <= distance:
                            temp_similar_list.append(file_second)
                            temp_filename_list.append(file_second)

                if len(temp_similar_list) > 1:
                    self.similar_group_dict[num] = temp_similar_list

                    for _item in temp_similar_list[1:]:
                        self.remove_filename_list.append(_item)

                    num = num + 1
            
            ############
            # print

            num_duplicate = len(self.remove_filename_list)
            num_all = len(filename_hash)
            percentage = np.round(num_duplicate/num_all * 100, 2)

            print('There are {} similar images in distance from {} images which is around {} %.'.format(num_duplicate, num_all,percentage))

            return self.remove_filename_list, self.similar_group_dict

    #################################################################
    # Get           
    def get(self):
            
        return self.similar_group_dict, self.remove_filename_list
            
                   
    #################################################################
    # Show    
    def show(self, max_sample_case = 1, max_sample_each_case = 1, size = 1):
        
        try:
            get_ipython
#             matplotlib show duplicate or similar picture
            print(self.similar_group_dict)
        except:
            print('Please run it in notebook')

    #################################################################
    # Move 
    def move_to_folder(self):
        pass

    #################################################################
    # Remove    
    def remove_in_folder(self):
        for filename in self.remove_filename_list:
            file_path = os.path.join(self.current_path, filename)
            os.remove(file_path)

# Credit: https://medium.com/@urvisoni/removing-duplicate-images-through-python-23c5fdc7479e

In [217]:
fig, axs = plt.subplots(5, 5, figsize=(20, 20))

col = -1

for index, image, wrong, right in zip(range(25), wrong_list1[:25], wrong_label1[:25], right_label1[:25]):
    path = os.path.join(training_dir, image)
    image = Image.open(path)
    row = index%5
    if row == 0:
        col = col + 1

    axs[row,col].imshow(np.array(image))
    axs[row,col].set_title('Predict as {}, Actual {}'.format(wrong[0], right))
    axs[row,col].grid(False)

plt.show()

NameError: name 'plt' is not defined

# Test Function

In [220]:
pic_path = '.\image_data'

In [221]:
my_dup = duplicate(pic_path)

In [222]:
remove_list, similar_group = my_dup.find(mode = 'same')

There are 4 duplicated images from 30 images which is around 13.33 %.


In [223]:
remove_list

['q_copy1.jpg', 'q_copy2.jpg', 'r_copy.jpg', 's_copy.jpg']

In [224]:
similar_group

{0: ['s.jpg', 's_copy.jpg'],
 1: ['q.jpg', 'q_copy1.jpg', 'q_copy2.jpg'],
 2: ['r.jpg', 'r_copy.jpg']}

In [225]:
remove_list, similar_group = my_dup.find(mode = 'similar', distance = 0)

The accepted distance is 0
There are 4 similar images in distance from 30 images which is around 13.33 %.


In [226]:
remove_list

['q_copy1.jpg', 'q_copy2.jpg', 'r_copy.jpg', 's_copy.jpg']

In [227]:
similar_group

{0: ['q.jpg', 'q_copy1.jpg', 'q_copy2.jpg'],
 1: ['r.jpg', 'r_copy.jpg'],
 2: ['s.jpg', 's_copy.jpg']}

# Usage

In [233]:
# Test Pypi

# !pip install --upgrade --force-reinstall --index-url https://test.pypi.org/simple/ --no-deps goldfish

Looking in indexes: https://test.pypi.org/simple/
Collecting goldfish
  Downloading https://test-files.pythonhosted.org/packages/96/2b/071d8142c9fa96b85b40fbc08393aba01f2df7dbd1bb156b6809a410958e/goldfish-0.0.3-py3-none-any.whl (5.8 kB)
Installing collected packages: goldfish
  Attempting uninstall: goldfish
    Found existing installation: goldfish 0.0.2
    Uninstalling goldfish-0.0.2:
      Successfully uninstalled goldfish-0.0.2
Successfully installed goldfish-0.0.3


In [None]:
# Pypi

# !pip install goldfish

In [1]:
from goldfish.images.duplicate import duplicate

In [4]:
remove_list, similar_group = duplicate('.\image_data').find(mode = 'same')

There are 4 duplicated images from 30 images which is around 13.33 %.


In [6]:
remove_list

['q_copy1.jpg', 'q_copy2.jpg', 'r_copy.jpg', 's_copy.jpg']

In [5]:
similar_group

{0: ['s.jpg', 's_copy.jpg'],
 1: ['q.jpg', 'q_copy1.jpg', 'q_copy2.jpg'],
 2: ['r.jpg', 'r_copy.jpg']}

In [7]:
remove_list, similar_group = duplicate('.\image_data').find(mode = 'similar', distance = 10)

The accepted distance is 10
There are 5 similar images in distance from 30 images which is around 16.67 %.


In [8]:
remove_list

['q_copy1.jpg', 'q_copy2.jpg', 'r_copy.jpg', 's_copy.jpg', 'x_edit2.jpg']

In [9]:
similar_group

{0: ['q.jpg', 'q_copy1.jpg', 'q_copy2.jpg'],
 1: ['r.jpg', 'r_copy.jpg'],
 2: ['s.jpg', 's_copy.jpg'],
 3: ['x.jpg', 'x_edit2.jpg']}