# Step 1: Install Libraries

In [None]:
!pip install opencv-python-headless numpy pillow imagehash scikit-learn


In [None]:
# Install required libraries if not installed
# !pip install opencv-python-headless numpy pillow imagehash scikit-learn

import cv2
import numpy as np
from PIL import Image
import imagehash
from sklearn.cluster import KMeans
import hashlib


# Step 2: Load and Display Images

In [None]:
# Import necessary modules
import os
from PIL import Image
import pandas as pd  # For table display
from IPython.display import display

# Function to load images from a directory
def load_images_from_folder(folder_path):
    """Load every readable image in ``folder_path`` as an RGB PIL image.

    Entries are visited in sorted filename order so that the index of an
    image is the same on every run and platform (``os.listdir`` makes no
    ordering guarantee). Sub-directories and files that PIL cannot read
    (e.g. ``Thumbs.db``, videos, truncated downloads) are skipped with a
    short notice instead of aborting the whole load.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive).

    Returns
    -------
    tuple[list, list]
        ``(images, filenames)`` -- two parallel lists; ``filenames[i]`` is the
        bare file name of ``images[i]``.
    """
    images = []
    filenames = []  # Kept index-aligned with `images`
    for filename in sorted(os.listdir(folder_path)):
        img_path = os.path.join(folder_path, filename)

        # Directories cannot be opened as images
        if not os.path.isfile(img_path):
            continue

        try:
            # convert() forces the pixel data to be read, so the file handle
            # can be released as soon as the `with` block exits
            with Image.open(img_path) as raw:
                img = raw.convert('RGB')
        except OSError:
            # PIL's UnidentifiedImageError is an OSError subclass
            print(f"Skipping unreadable or non-image file: {filename}")
            continue

        images.append(img)
        filenames.append(filename)
    return images, filenames

# Point this at the folder holding the photos to analyse.
# The raw-string prefix keeps the Windows backslashes literal.
folder_path = r"C:\APPU SELVA\Pongal 2024"
images, filenames = load_images_from_folder(folder_path)

# Sanity check: preview the first loaded image, or report an empty folder
if not images:
    print("No images found in the specified folder.")
else:
    display(images[0])

# Step 3: Convert Images to Hash Values

In [None]:
# Function to convert images to perceptual hash values
def image_to_phash(image):
    """Return the perceptual hash of ``image`` as computed by ``imagehash.phash``."""
    perceptual_hash = imagehash.phash(image)
    return perceptual_hash

# One perceptual hash per loaded image, in the same order as `images`
image_hashes = list(map(image_to_phash, images))

# File extension (including the leading dot) of every filename
extensions = [os.path.splitext(name)[1] for name in filenames]

# Summary table: one row per image, hashes rendered via str()
df = pd.DataFrame({
    'Filename': filenames,
    'Extension': extensions,
    'Hash Value': list(map(str, image_hashes)),
})

# Render the table
display(df)


In [None]:
!pip install tabulate


In [None]:
# Import necessary modules
#import os
import sqlite3
#import imagehash
#from PIL import Image
import time
#import pandas as pd  # Import pandas for table display

# Function to load images from a directory and retrieve properties
def load_images_from_folder(folder_path):
    """Load every readable image in ``folder_path`` together with its file properties.

    Entries are visited in sorted filename order so that row order is
    reproducible (``os.listdir`` makes no ordering guarantee). Sub-directories
    and files that PIL cannot read are skipped with a short notice, so one
    stray file (``Thumbs.db``, a video, a truncated download) no longer aborts
    the whole load.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive).

    Returns
    -------
    tuple of six parallel lists
        ``(images, filenames, extensions, locations, sizes, created_dates)``:
        RGB PIL images, bare file names, extensions (with leading dot), full
        paths, sizes in bytes, and ``time.ctime``-formatted timestamps from
        ``os.path.getctime`` (creation time on Windows; on Unix this is the
        last metadata change).
    """
    images = []
    filenames = []
    extensions = []
    locations = []
    sizes = []
    created_dates = []

    for filename in sorted(os.listdir(folder_path)):
        img_path = os.path.join(folder_path, filename)

        # Directories cannot be opened as images
        if not os.path.isfile(img_path):
            continue

        try:
            # Normalise to 3-channel RGB. convert() forces the pixel data to be
            # read, so the file handle can be released when the `with` exits.
            with Image.open(img_path) as raw:
                img = raw.convert('RGB')
            size_bytes = os.path.getsize(img_path)
            created = time.ctime(os.path.getctime(img_path))
        except OSError:
            # PIL's UnidentifiedImageError is an OSError subclass
            print(f"Skipping unreadable or non-image file: {filename}")
            continue

        # Append to every list together so they stay index-aligned
        images.append(img)
        filenames.append(filename)
        extensions.append(os.path.splitext(filename)[1])
        locations.append(img_path)
        sizes.append(size_bytes)
        created_dates.append(created)

    return images, filenames, extensions, locations, sizes, created_dates

# Function to convert images to perceptual hash values
def image_to_phash(image):
    """Compute the perceptual hash (``imagehash.phash``) of a single image."""
    phash_value = imagehash.phash(image)
    return phash_value

# Function to create and initialize the database with additional fields
def initialize_database(db_name=""):
    """Open a SQLite database and make sure the ``ImageInfo`` table exists.

    Parameters
    ----------
    db_name : str
        Passed straight to ``sqlite3.connect``. NOTE: per SQLite's
        documentation an empty filename (the default here) opens a private
        temporary database that is discarded when the connection closes;
        pass a real file path to keep the data, or ``":memory:"`` for an
        in-memory database.

    Returns
    -------
    sqlite3.Connection
        The open connection; the caller is responsible for closing it.
    """
    # Schema: one row per image with its hash plus basic file properties
    create_table_sql = '''
        CREATE TABLE IF NOT EXISTS ImageInfo (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT NOT NULL,
            hash_value TEXT NOT NULL,
            extension TEXT NOT NULL,
            location TEXT NOT NULL,
            size INTEGER NOT NULL,
            created TEXT NOT NULL
        )
    '''
    connection = sqlite3.connect(db_name)
    connection.cursor().execute(create_table_sql)
    connection.commit()
    return connection

# Function to insert image data into the database
def insert_image_data(conn, filename, hash_value, extension, location, size, created):
    """Insert one image record into ``ImageInfo`` and commit immediately.

    Values are bound through ``?`` placeholders, so they are never spliced
    into the SQL text. ``id`` is left to the table's AUTOINCREMENT.
    """
    insert_sql = '''
        INSERT INTO ImageInfo (filename, hash_value, extension, location, size, created)
        VALUES (?, ?, ?, ?, ?, ?)
    '''
    record = (filename, hash_value, extension, location, size, created)
    conn.cursor().execute(insert_sql, record)
    conn.commit()

# Load images and their details
folder_path = r"C:\APPU SELVA\Pongal 2024"  # Adjust the path as needed
images, filenames, extensions, locations, sizes, created_dates = load_images_from_folder(folder_path)

# Initialize the database.
# NOTE: initialize_database() is called with its default (empty) name, which
# SQLite treats as a temporary database -- pass a file path to persist rows.
conn = initialize_database()

try:
    # Process each image, calculate hash, and store in database
    for img, filename, extension, location, size, created in zip(
        images, filenames, extensions, locations, sizes, created_dates
    ):
        hash_value = str(image_to_phash(img))  # Convert hash to string
        insert_image_data(conn, filename, hash_value, extension, location, size, created)

    # Verify by fetching all entries from the database
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM ImageInfo")
    rows = cursor.fetchall()
finally:
    # Always release the connection, even if hashing or an insert failed
    conn.close()

# Create a pandas DataFrame to display the data in a table format
# (`rows` is already fully fetched, so the closed connection is not needed)
columns = ['ID', 'Filename', 'Hash Value', 'Extension', 'Location', 'Size (bytes)', 'Created']
df = pd.DataFrame(rows, columns=columns)

# Set display options for a cleaner look in larger tables
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.width', 200)         # Increase width for larger tables
pd.set_option('display.colheader_justify', 'center')  # Center column headers

# Print the table as text (the width/justify options above apply to this form)
print(df)


# Step 4: Store Hashes in a Database (Dictionary)

In [None]:
# Build an index -> perceptual-hash lookup table.
# Keys are the 0-based positions of the images in `image_hashes`, so the hash
# of a specific image can be fetched directly by its index -- handy for later
# steps such as duplicate checks or organising images.
image_database = dict(enumerate(image_hashes))

# Show the resulting mapping
print(image_database)


# Step 5: Detect Exact Duplicates Using Cryptographic Hashes

In [None]:
# Function to generate cryptographic hash of an image
def cryptographic_hash(image):
    """Return a SHA-256 hex digest that identifies the exact pixel content of ``image``.

    The digest covers the array's shape and dtype as well as the raw pixel
    bytes. Hashing the bytes alone would give the same digest to images with
    different dimensions but the same flattened byte stream (for example a
    2x3 and a 3x2 image of one solid colour), which would be reported as
    exact duplicates.

    Parameters
    ----------
    image : array-like
        Anything ``numpy.asarray`` accepts, e.g. a PIL image or an ndarray.

    Returns
    -------
    str
        64-character lowercase hexadecimal SHA-256 digest.
    """
    pixels = np.asarray(image)

    digest = hashlib.sha256()
    # Prefix with the geometry so equal byte streams of different shapes differ
    digest.update(f"{pixels.shape}|{pixels.dtype.str}|".encode("ascii"))
    # tobytes() always emits C-order bytes, regardless of memory layout
    digest.update(pixels.tobytes())
    return digest.hexdigest()

# SHA-256 digest of every loaded image, in the same order as `images`
crypto_hashes = list(map(cryptographic_hash, images))

# List the digests using 1-based image numbers
for position, digest in enumerate(crypto_hashes, start=1):
    print(f"Image {position}: Cryptographic Hash = {digest}")


In [1]:
pip install nbstripout # To reduce the file size


Collecting nbstripout
  Downloading nbstripout-0.8.0-py2.py3-none-any.whl.metadata (19 kB)
Downloading nbstripout-0.8.0-py2.py3-none-any.whl (16 kB)
Installing collected packages: nbstripout
Successfully installed nbstripout-0.8.0
Note: you may need to restart the kernel to use updated packages.


# Step 6: Detect Near-Duplicates Using Hamming Distance

In [None]:
# Function to calculate Hamming distance between two hashes
def hamming_distance(hash1, hash2):
    """Count the bit positions at which two image hashes differ.

    Parameters
    ----------
    hash1, hash2 : objects exposing a ``.hash`` array
        Typically ``imagehash.ImageHash`` instances; ``.hash`` is flattened
        before comparison.

    Returns
    -------
    int
        Number of differing bits (0 means identical hashes).

    Raises
    ------
    ValueError
        If the two hashes hold a different number of bits. Comparing them
        element-by-element with ``zip`` would silently ignore the surplus
        bits and report a misleadingly small distance.
    """
    bits1 = np.asarray(hash1.hash).ravel()
    bits2 = np.asarray(hash2.hash).ravel()

    if bits1.shape != bits2.shape:
        raise ValueError(
            f"Cannot compare hashes of different sizes: {bits1.size} vs {bits2.size} bits"
        )

    # Vectorised comparison instead of a Python-level loop over every bit
    return int(np.count_nonzero(bits1 != bits2))

# Flag every unordered pair of images whose perceptual hashes are close
threshold = 10  # Pairs with a Hamming distance strictly below this are reported
hash_count = len(image_hashes)

# Lazily score each pair (first < second) exactly once
scored_pairs = (
    (first, second, hamming_distance(image_hashes[first], image_hashes[second]))
    for first in range(hash_count)
    for second in range(first + 1, hash_count)
)
near_duplicates = [pair for pair in scored_pairs if pair[2] < threshold]

# Report the matches using 1-based image numbers
print("Near-duplicate images:")
for first, second, bit_diff in near_duplicates:
    print(f"Images {first+1} and {second+1} have a Hamming distance of {bit_diff}")


# Step 7: Cluster Similar Images Using K-Means Clustering

In [None]:
# Convert hashes to a numeric format for clustering: one flattened bit-vector
# per image, cast to float (KMeans works on floating-point features).
# The loop variable is not called `hash`, to avoid shadowing the builtin.
hash_vectors = np.array(
    [img_hash.hash.flatten() for img_hash in image_hashes], dtype=float
)

# Perform K-Means clustering
num_clusters = 3  # Desired number of clusters

# KMeans raises ValueError when asked for more clusters than there are
# samples, so cap the request for folders with fewer than `num_clusters` images
effective_clusters = min(num_clusters, len(hash_vectors))

if effective_clusters == 0:
    print("No images to cluster.")
    cluster_labels = np.array([], dtype=int)
else:
    kmeans = KMeans(n_clusters=effective_clusters, random_state=0).fit(hash_vectors)
    # Assign images to clusters
    cluster_labels = kmeans.labels_

# Display cluster assignments.
# Cluster ids are printed 1-based, matching both the 1-based image numbers
# used here and the "Cluster N" headings printed by display_clusters().
print("Cluster assignments:")
for i, label in enumerate(cluster_labels):
    print(f"Image {i+1} is in cluster {label + 1}")


# Step 8: Visualize Clusters 

In [None]:
import matplotlib.pyplot as plt

# Function to display images in clusters
def display_clusters(images, labels):
    """Show every image grouped under a heading for its cluster.

    Parameters
    ----------
    images : sequence
        Images accepted by ``plt.imshow``; ``images[i]`` belongs to ``labels[i]``.
    labels : sequence
        Cluster id per image. Headings print ``label + 1``.

    Clusters appear in the order in which their label is first encountered;
    each image is drawn in its own figure with the axes hidden.
    """
    grouped = {}
    for position, label in enumerate(labels):
        grouped.setdefault(label, []).append(images[position])

    for cluster_id in grouped:
        print(f"\nCluster {cluster_id + 1}:")
        for member in grouped[cluster_id]:
            plt.imshow(member)
            plt.axis('off')
            plt.show()

# Render every loaded image under the cluster K-Means assigned it to
display_clusters(images=images, labels=cluster_labels)
