In [42]:
import os
from PIL import Image
import imagehash
from decimal import Decimal
from typing import Tuple


In [43]:
import inspect

def fname():
    """Return the name of the function that called ``fname``.

    Starts from this function's own frame, steps one frame back to the
    caller, and reads the name stored on the caller's code object.
    """
    own_frame = inspect.currentframe()
    caller_frame = own_frame.f_back
    return caller_frame.f_code.co_name



In [44]:
# Paths of the sample images used by the demo cells of this notebook.
crs = [
    "data-copyrighted-imgs/2023-10-15-luporini.jpg",
    "data-copyrighted-imgs/2023-08-10-erni.jpeg"
]

In [45]:

# Composite hash: four hash strings, in the order produced by
# compute_composite_hash (average_hash, phash, dhash, whash).
type TCompositeHash = Tuple[str, str, str, str]

In [46]:
def compute_composite_hash(image_path: str) -> TCompositeHash:
    """Compute the four-part composite hash of an image file.

    The image is hashed with ``average_hash``, ``phash``, ``dhash`` and
    ``whash`` from ``imagehash`` (in that order), and each result is rendered
    to its string form.

    :param image_path: Path of the image file to open with PIL.
    :return: Tuple of four hash strings, one per algorithm, in the order above.
    """
    # Open through a context manager so the underlying file handle is released
    # as soon as hashing is done instead of lingering until garbage collection.
    with Image.open(image_path) as img:
        hashes = (
            imagehash.average_hash(img),
            imagehash.phash(img),
            imagehash.dhash(img),
            imagehash.whash(img),
        )

    return tuple(f"{h}" for h in hashes)

In [47]:
# Average hash of the first sample image, kept in `h` for inspection.
h = imagehash.average_hash(Image.open(crs[0]))

In [48]:
# Display the string form of the hash object.
f"{h}"

'fffcf8182c082020'

In [49]:
def serialize_hash(hash: TCompositeHash) -> str:
    """Flatten a composite hash into a single string.

    :param hash: Tuple of hash strings to combine.
    :return: The hash strings joined by a comma followed by a space.
    """
    separator = ", "
    serialized = separator.join(hash)
    return serialized

def deserialize_hash(hash_str: str) -> TCompositeHash:
    """Split a serialized composite hash back into its parts.

    :param hash_str: String whose parts are separated by ``", "``.
    :return: Tuple holding each part, in the order they appear in the string.
    """
    parts = hash_str.split(", ")
    return (*parts,)

In [50]:
def _decode_single_hash(hash_str: str) -> imagehash.ImageHash:
    """Decode one hash string into an ``imagehash.ImageHash``.

    :param hash_str: Hash string to pass to ``imagehash.hex_to_hash``.
    :return: The decoded hash object.
    :raises ValueError: If ``hash_str`` is empty.
    :raises Exception: Any decoding failure is re-raised as the same exception
        class (or ``ValueError`` when that class cannot be built from a single
        message), with a message naming this function and the offending
        string, chained to the original error.
    """
    if not hash_str:
        # Prefix with fname() like every other error raised in this file;
        # __name__ is the module name, not the function name.
        raise ValueError(f"{fname()}::Empty hash string provided")
    try:
        return imagehash.hex_to_hash(hash_str)
    except Exception as e:
        message = f"{fname()}::Unexpected '{e.__class__.__name__}' while decoding hash string '{hash_str}': {e}"
        try:
            wrapped = e.__class__(message)
        except Exception:
            # Some exception classes need more than a message to be built;
            # fall back to ValueError so the context is not lost to a TypeError.
            wrapped = ValueError(message)
        raise wrapped from e

def _decode_hash(composite_hash_str: str, separator: str, lenght: int) -> Tuple[imagehash.ImageHash, ...]:
    """Decode a serialized composite hash into its individual hash objects.

    The return annotation reflects what is actually produced: decoded
    ``imagehash.ImageHash`` objects, not the hash strings of ``TCompositeHash``.

    :param composite_hash_str: Serialized composite hash to decode.
    :param separator: Delimiter placed between the individual hash strings.
    :param lenght: Number of individual hashes the composite must contain.
    :return: Tuple of decoded hashes, in the order they appear in the string.
    :raises ValueError: If the string is empty, or if the number of decoded
        hashes differs from ``lenght``. Failures while decoding an individual
        hash propagate from ``_decode_single_hash``.
    """
    if not composite_hash_str:
        raise ValueError(f"{fname()}::Empty hash string provided")

    pieces = composite_hash_str.split(separator)
    decoded = tuple(_decode_single_hash(piece) for piece in pieces)

    if len(decoded) != lenght:
        raise ValueError(f"{fname()}::Invalid hash string provided, expected {lenght} hashes, got {len(decoded)}")

    return decoded

In [51]:
from typing import Literal


def _single_hash_difference(hash1: str, hash2: str) -> Decimal:
    """Return the difference between two hash strings as a ``Decimal``.

    Both strings are decoded with ``_decode_single_hash`` and the decoded
    hashes are subtracted from one another.
    NOTE(review): for ``imagehash`` objects the subtraction is presumably the
    count of differing bits; confirm against the library documentation.

    :param hash1: First hash string.
    :param hash2: Second hash string.
    :return: Result of ``decoded(hash1) - decoded(hash2)`` wrapped in ``Decimal``.
    """
    first = _decode_single_hash(hash1)
    second = _decode_single_hash(hash2)
    return Decimal(first - second)


def hash_difference(hash1: TCompositeHash, hash2: TCompositeHash, lenght: int) -> Decimal:
    """Average the pairwise differences of two composite hashes.

    :param hash1: First composite hash (tuple of hash strings).
    :param hash2: Second composite hash (tuple of hash strings).
    :param lenght: Number of hash strings each composite must contain.
    :return: Sum of the per-position differences divided by the number of
        hashes in ``hash1``.
    :raises ValueError: If either composite is empty or does not contain
        exactly ``lenght`` hashes.
    """
    if not (hash1 and hash2):
        raise ValueError(f"{fname()}::At least one empty hash string provided: '{hash1}', '{hash2}'")

    size1, size2 = len(hash1), len(hash2)
    if not (size1 == lenght and size2 == lenght):
        raise ValueError(f"{fname()}::At least one invalid composite hash string provided, expected {lenght} serialized hashes, got {size1} and {size2}")

    # Accumulate position by position; both composites have the same size here.
    total = 0
    for left, right in zip(hash1, hash2):
        total += _single_hash_difference(left, right)

    return Decimal(total) / Decimal(size1)


# Category assigned to a pair of composite hashes by hash_categorization.
type THashComparison = Literal[
    "identical",
    "similar",
    "different"
]


def hash_categorization(hash1: TCompositeHash, hash2: TCompositeHash, identity_threshold: int, similarity_threshold: int, lenght: int) -> THashComparison:
    """Classify two composite hashes by their average difference.

    :param hash1: First composite hash.
    :param hash2: Second composite hash.
    :param identity_threshold: Largest average difference still reported as
        ``"identical"`` (inclusive).
    :param similarity_threshold: Largest average difference still reported as
        ``"similar"`` (inclusive).
    :param lenght: Number of hashes each composite must contain.
    :return: ``"identical"``, ``"similar"`` or ``"different"``.
    """
    distance = hash_difference(hash1, hash2, lenght)

    # Thresholds are checked from strictest to loosest.
    if distance <= identity_threshold:
        return "identical"
    if distance <= similarity_threshold:
        return "similar"
    return "different"
    


In [52]:
# Composite hashes of the two sample images.
h1 = compute_composite_hash(crs[0])
h2 = compute_composite_hash(crs[1])

In [54]:
# Average difference between the two sample images (4 hashes per composite).
hash_difference(h1, h2, 4)

Decimal('30.25')

In [56]:
# Inclusive upper bounds on the average difference for each category.
IDENTITY_THRESHOLD = 5
SIMILARITY_THRESHOLD = 10

# Sanity check: categorize the first sample image against itself.
hash_categorization(h1, h1, IDENTITY_THRESHOLD, SIMILARITY_THRESHOLD, 4)

'identical'

In [57]:
from typing import Dict, NamedTuple


class KnownImage(NamedTuple):
    """A reference image: an identifier paired with its composite hash."""
    id: str  # identifier; used as the key in comparison result dictionaries
    hash: TCompositeHash  # composite hash the source image is compared against

# Comparison outcome for each known image, keyed by image id.
type KnownImageComparison = Dict[
    str,  # image id
    THashComparison  # comparison result
]


def all_hash_categorizations(
    source_image_path: str,
    known_image_hashes: Tuple[KnownImage, ...],
    identity_threshold: int,
    similarity_threshold: int,
    lenght: int
) -> KnownImageComparison:
    """
    Hash a source image and categorize it against every known image.

    :param source_image_path: Path to the source image.
    :param known_image_hashes: Tuple of known images (any number of entries),
        each carrying an id and a composite hash.
    :param identity_threshold: Threshold for identical images.
    :param similarity_threshold: Threshold for similar images.
    :param lenght: Number of hashes in the composite hash.
    :return: Dictionary with image IDs as keys and comparison results as values.
        If several known images share an id, the last one wins.
    """
    # Hash the source once and reuse it for every comparison.
    source_hash = compute_composite_hash(source_image_path)

    result = {
        img.id: hash_categorization(source_hash, img.hash, identity_threshold, similarity_threshold, lenght)
        for img in known_image_hashes
    }

    return result

In [71]:
# Reference set: one KnownImage per sample image.
known_images = (
    KnownImage(id="luporini-23", hash=compute_composite_hash(crs[0])),
    KnownImage(id="erni-23", hash=compute_composite_hash(crs[1]))
)



In [73]:
# Categorize the second sample image against every known image.
d = all_hash_categorizations(
    source_image_path=crs[1],
    known_image_hashes=known_images,
    identity_threshold=IDENTITY_THRESHOLD,
    similarity_threshold=SIMILARITY_THRESHOLD,
    lenght=4
)

In [74]:
# Show the category assigned to each known image.
print(f"{d}")

{'luporini-23': 'different', 'erni-23': 'identical'}


In [75]:
# Show the serialized form of the second sample image's composite hash.
print(serialize_hash(compute_composite_hash(crs[1])))

f1f1e3cf98f0d800, eceb68cc18b1479c, a323071c30e1b037, f1f1e3cfdcf09000


In [76]:
def filter_hash_categorizations(
    hash_categorizations: KnownImageComparison,
    filter_values: Tuple[THashComparison, ...]
) -> KnownImageComparison:
    """
    Keep only the entries whose comparison result is one of the wanted values.

    :param hash_categorizations: Mapping from image ID to comparison result.
    :param filter_values: Comparison results that should be retained.
    :return: New dictionary holding just the matching entries, in their
        original order.
    """
    kept = {}
    for image_id, category in hash_categorizations.items():
        if category in filter_values:
            kept[image_id] = category
    return kept
    

In [77]:
# Keep only the known images categorized as identical or similar.
filter_hash_categorizations(
    hash_categorizations=d,
    filter_values=("identical", "similar")
)

{'erni-23': 'identical'}