In [None]:
import fitz  # PyMuPDF
import hashlib
import os

def hash_image(image_data, hash_algorithm='sha256'):
    """
    Generate a hash for image data.

    Args:
        image_data: Binary image data (a bytes-like object)
        hash_algorithm: Name of any fixed-length hash algorithm known to
            hashlib ('md5', 'sha1', 'sha256', 'sha512', 'blake2b', ...)

    Returns:
        Hexadecimal hash string of the image

    Raises:
        ValueError: If hash_algorithm is unknown to hashlib or is a
            variable-length algorithm (SHAKE), whose hexdigest needs a length
    """
    try:
        hash_obj = hashlib.new(hash_algorithm)
    except (ValueError, TypeError):
        # hashlib raises TypeError for non-string names; keep one error type.
        raise ValueError(f"Unsupported hash algorithm: {hash_algorithm}") from None

    # Variable-length digests (shake_*) report digest_size 0 and cannot be
    # rendered by a plain hexdigest() call.
    if hash_obj.digest_size == 0:
        raise ValueError(f"Unsupported hash algorithm: {hash_algorithm}")

    hash_obj.update(image_data)
    return hash_obj.hexdigest()

# Input PDF, directory for the extracted images, and the hash report file
pdf_path = "Norfolk_Public_Review_PR_RFI_Launch__Template_V2_.pdf"
output_dir = "extracted_images"
hash_file = "image_hashes.txt"

# Create output directory if it doesn't exist (no check-then-create race)
os.makedirs(output_dir, exist_ok=True)

# Dictionary mapping each hash to the filenames that share it (duplicate detection)
hash_to_files = {}

# Track total images
image_count = 0

# Both the PDF and the report are managed by the `with` statement, so they are
# closed even if extraction raises part-way through.
with fitz.open(pdf_path) as pdf, open(hash_file, "w") as hash_f:
    hash_f.write("Filename, Hash (SHA-256), Page Number, Image Index\n")

    # Process each page (page_num is 0-based; the report shows it 1-based)
    for page_num, page in enumerate(pdf):
        # Process each image referenced on the page
        for img_index, img in enumerate(page.get_images()):
            xref = img[0]  # first item of each entry is the image's xref
            base_image = pdf.extract_image(xref)
            image_bytes = base_image["image"]

            # Hash the image data; the algorithm is named explicitly so it
            # always matches the "SHA-256" column in the report header.
            img_hash = hash_image(image_bytes, 'sha256')

            # extract_image returns the bytes in the image's stored format and
            # reports it under "ext"; use that so a JPEG is not saved as .png.
            extension = base_image.get("ext", "png")
            filename = f"output_image_{page_num}_{img_index}.{extension}"
            file_path = os.path.join(output_dir, filename)

            # Track the hash for duplicate detection
            is_duplicate = img_hash in hash_to_files
            hash_to_files.setdefault(img_hash, []).append(filename)

            # Save image information to the hash file
            duplicate_info = " (DUPLICATE)" if is_duplicate else ""
            hash_f.write(f"{filename}, {img_hash}, {page_num+1}, {img_index}{duplicate_info}\n")

            # Only the first occurrence of each distinct image is written to
            # disk; duplicates are recorded in the report but not saved.
            if not is_duplicate:
                with open(file_path, "wb") as img_file:
                    img_file.write(image_bytes)

            image_count += 1
            print(f"Processed image {image_count}: {filename} - Hash: {img_hash[:10]}...")

    # Write duplicate summary at the end of the hash file
    hash_f.write("\n\nDuplicate Images Summary:\n")
    for img_hash, files in hash_to_files.items():
        if len(files) > 1:
            hash_f.write(f"\nHash {img_hash} appears in:\n")
            for f in files:
                hash_f.write(f"  - {f}\n")

print(f"\nExtracted and hashed {image_count} images from {pdf_path}")
print(f"Image files saved to: {output_dir}")
print(f"Hash information saved to: {hash_file}")

In [2]:
import hashlib
import os
from PIL import Image
import io

def hash_image(image_path, hash_algorithm='md5'):
    """
    Generate a hash for an image file.

    The file is read in fixed-size chunks, so large images are never held
    in memory in full.

    Args:
        image_path: Path to the image file
        hash_algorithm: Name of any fixed-length hash algorithm known to
            hashlib ('md5', 'sha1', 'sha256', 'sha512', 'blake2b', ...)

    Returns:
        Hexadecimal hash string of the file's contents

    Raises:
        OSError: If the file cannot be opened or read
        ValueError: If hash_algorithm is unknown to hashlib or is a
            variable-length algorithm (SHAKE), whose hexdigest needs a length
    """
    chunk_size = 64 * 1024

    # The file is opened before the algorithm is validated so that a bad path
    # is still reported ahead of a bad algorithm name.
    with open(image_path, 'rb') as f:
        try:
            hash_obj = hashlib.new(hash_algorithm)
        except (ValueError, TypeError):
            # hashlib raises TypeError for non-string names; keep one error type.
            raise ValueError(f"Unsupported hash algorithm: {hash_algorithm}") from None

        # Variable-length digests (shake_*) report digest_size 0 and cannot
        # be rendered by a plain hexdigest() call.
        if hash_obj.digest_size == 0:
            raise ValueError(f"Unsupported hash algorithm: {hash_algorithm}")

        for chunk in iter(lambda: f.read(chunk_size), b''):
            hash_obj.update(chunk)

    return hash_obj.hexdigest()

# Example usage: hash every extracted PNG in a directory and record the results.
image_dir = "."  # Directory that is scanned for the image files
output_file = "image_hashes.txt"

# Keep only directory entries named like "output_image_*.png"
image_files = list(
    filter(
        lambda name: name.startswith("output_image_") and name.endswith(".png"),
        os.listdir(image_dir),
    )
)

# Write one "<filename>: <sha256>" line per image, in sorted filename order
with open(output_file, "w") as out_f:
    for img_file in sorted(image_files):
        img_path = os.path.join(image_dir, img_file)
        # SHA-256 is requested explicitly instead of hash_image's default
        img_hash = hash_image(img_path, 'sha256')
        out_f.write(f"{img_file}: {img_hash}\n")
        print(f"Hashed {img_file}")

print(f"All hashes written to {output_file}")

All hashes written to image_hashes.txt
