In [1]:
from PIL import Image
import imagehash
from collections import defaultdict
import os
from pillow_heif import register_heif_opener

In [2]:
register_heif_opener()

In [2]:
def calculate_image_hash(image_path):
    """Calculate perceptual hash for an image."""
    img = Image.open(image_path)
    return imagehash.average_hash(img)

In [3]:
import pandas as pd

In [5]:
def group_similar_images(image_paths):
    """Group similar images based on perceptual hash."""
    image_hashes = {path: calculate_image_hash(path) for path in image_paths}

    grouped_images = defaultdict(list)
    unique_images = []

    for path1 in image_paths:
        is_unique = True

        for path2 in unique_images:
            if are_images_similar(image_hashes[path1], image_hashes[path2]):
                grouped_images[path2].append(path1)
                is_unique = False
                break

        if is_unique:
            unique_images.append(path1)

    return unique_images, grouped_images


In [6]:
def are_images_similar(hash1, hash2, threshold=10):
    """Check if two image hashes are similar based on a threshold."""
    return hash1 - hash2 < threshold

In [3]:
image_folder_path = "./images/doppelganger/"
image_paths = []

In [4]:
dir_with_images = os.scandir(image_folder_path)

In [35]:
for file in dir_with_images:
    image_paths.append(f"{image_folder_path}{file.name}")

In [40]:
unique_images, grouped_images = group_similar_images(image_paths)


In [41]:
unique_images

['./images/doppelganger/2-1.JPG',
 './images/doppelganger/1.HEIC',
 './images/doppelganger/4-1.JPG',
 './images/doppelganger/5.JPG',
 './images/doppelganger/3-1.JPG',
 './images/doppelganger/1-2.HEIC',
 './images/doppelganger/3.JPG',
 './images/doppelganger/5-1.JPG']

In [42]:
grouped_images

defaultdict(list,
            {'./images/doppelganger/2-1.JPG': ['./images/doppelganger/2-2.JPG',
              './images/doppelganger/2.JPG'],
             './images/doppelganger/4-1.JPG': ['./images/doppelganger/4.JPG']})

In [67]:
images_hashes = {}
matrix_img = {}
for p in image_paths:
    images_hashes[os.path.basename(p)] = calculate_image_hash(p)
    matrix_img[os.path.basename(p)] = {}

In [85]:
for img in images_hashes:
    for img2 in images_hashes:
        matrix_img[img][img2] = int(str(images_hashes[img2]), 16) - int(str(images_hashes[img]), 16)
        

In [86]:
matrix_img_df = pd.DataFrame(matrix_img)

In [87]:
matrix_img_df

Unnamed: 0,2-1.JPG,1.HEIC,2-2.JPG,4-1.JPG,4.JPG,5.JPG,3-1.JPG,2.JPG,1-2.HEIC,3.JPG,5-1.JPG
2-1.JPG,0,2476771658941726233,-33554432,-63863231062254946,-68384285432309092,9114786130887384353,-71980917776970785,0,2334835720024821012,-71483667821683681,2170204874447994145
1.HEIC,-2476771658941726233,0,-2476771658975280665,-2540634890003981179,-2545155944374035325,6638014471945658120,-2548752576718697018,-2476771658941726233,-141935938916905221,-2548255326763409914,-306566784493732088
2-2.JPG,33554432,2476771658975280665,0,-63863231028700514,-68384285398754660,9114786130920938785,-71980917743416353,33554432,2334835720058375444,-71483667788129249,2170204874481548577
4-1.JPG,63863231062254946,2540634890003981179,63863231028700514,0,-4521054370054146,9178649361949639299,-8117686714715839,63863231062254946,2398698951087075958,-7620436759428735,2234068105510249091
4.JPG,68384285432309092,2545155944374035325,68384285398754660,4521054370054146,0,9183170416319693445,-3596632344661693,68384285432309092,2403220005457130104,-3099382389374589,2238589159880303237
5.JPG,-9114786130887384353,-6638014471945658120,-9114786130920938785,-9178649361949639299,-9183170416319693445,0,-9186767048664355138,-9114786130887384353,-6779950410862563341,-9186269798709068034,-6944581256439390208
3-1.JPG,71980917776970785,2548752576718697018,71980917743416353,8117686714715839,3596632344661693,9186767048664355138,0,71980917776970785,2406816637801791797,497249955287104,2242185792224964930
2.JPG,0,2476771658941726233,-33554432,-63863231062254946,-68384285432309092,9114786130887384353,-71980917776970785,0,2334835720024821012,-71483667821683681,2170204874447994145
1-2.HEIC,-2334835720024821012,141935938916905221,-2334835720058375444,-2398698951087075958,-2403220005457130104,6779950410862563341,-2406816637801791797,-2334835720024821012,0,-2406319387846504693,-164630845576826867
3.JPG,71483667821683681,2548255326763409914,71483667788129249,7620436759428735,3099382389374589,9186269798709068034,-497249955287104,71483667821683681,2406319387846504693,0,2241688542269677826


In [81]:
def calculate_image_hashes(image_path):

    img = Image.open(image_path)

    average_hash = imagehash.average_hash(img)
    dhash = imagehash.dhash(img)
    phash = imagehash.phash(img)
    colorhash = imagehash.colorhash(img)

    return {
        "average_hash": int(str(average_hash), 16),
        "dhash": int(str(dhash), 16),
        "phash": int(str(phash), 16),
        "colorhash": int(str(colorhash), 16)
    }

In [82]:
%%time
hhh = calculate_image_hashes(image_paths[0])

CPU times: user 623 ms, sys: 86 ms, total: 709 ms
Wall time: 732 ms


In [83]:
hhh

{'average_hash': 18374403486693989407,
 'dhash': 15874190554173567158,
 'phash': 15920650585210538804,
 'colorhash': 532609499136}