In [1]:
# !pip install opencv-python pillow numpy torch torchvision cupy

from PIL import Image
from PIL import UnidentifiedImageError
import numpy as np
import os
import zipfile
import torch
import torch.nn.functional as F
import cupy as cp
from torchvision import transforms

# Suppress the DecompressionBombWarning
Image.MAX_IMAGE_PIXELS = None

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



In [2]:
def load_and_convert_to_grayscale(image_path):
    try:
        image = Image.open(image_path).convert('L')
        image = np.asarray(image)  # Convert to NumPy array
        return torch.tensor(image, device=device, dtype=torch.float32)  # Convert to PyTorch tensor, move to GPU, and set dtype to float32
    except UnidentifiedImageError as e:
        print(f"Skipping image {image_path}: {e}")
        return None  # Return None if image loading fails

In [3]:
# Define thresholds based on observed values
high_edge_threshold = 1500000
moderate_edge_threshold = 1200000
low_pixel_threshold = 50000000
moderate_pixel_threshold = 20000000
low_entropy_threshold = 0.25
moderate_entropy_threshold = 0.3
high_brightness_threshold = 244  # Adjusted based on sample data
low_brightness_threshold = 238   # Adjusted based on sample data

In [4]:
def calculate_edge_density(image):
    image = image.unsqueeze(0).unsqueeze(0)  # Add batch and channel dimensions

    # Define Sobel kernels
    sobel_kernel_x = torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], device=device, dtype=torch.float32).view(1, 1, 3, 3)
    sobel_kernel_y = torch.tensor([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], device=device, dtype=torch.float32).view(1, 1, 3, 3)

    # Apply Sobel kernels
    edges_x = F.conv2d(image, sobel_kernel_x, padding=1)
    edges_y = F.conv2d(image, sobel_kernel_y, padding=1)

    edges = torch.sqrt(edges_x**2 + edges_y**2)
    return edges.sum()

def calculate_non_zero_pixel_count(image):
    return torch.sum(image != 0)

def calculate_entropy(image):
    image_cp = cp.array(image.cpu().numpy())  # Convert to CuPy array
    hist, _ = cp.histogram(image_cp, bins=256, range=(0, 256))
    prob = hist / hist.sum()
    entropy = -cp.sum(prob * cp.log2(prob + 1e-10))
    return torch.tensor(entropy, device=device)  # Convert back to PyTorch tensor

def calculate_brightness(image):
    return torch.mean(image)

def save_image(image_path, category_folder):
    image_name = os.path.basename(image_path)
    output_path = os.path.join(category_folder, image_name)
    Image.open(image_path).save(output_path)

In [5]:
def classify_image(image, index, filename):
    edge_density = calculate_edge_density(image)
    non_zero_pixels = calculate_non_zero_pixel_count(image)
    entropy = calculate_entropy(image)
    brightness = calculate_brightness(image)

    # Debugging prints with index and filename
    print(f"\nIndex: {index}, Filename: {filename}")
    print(f"Edge Density: {edge_density.item()}")
    print(f"Non-Zero Pixels: {non_zero_pixels.item()}")
    print(f"Entropy: {entropy.item()}")
    print(f"Brightness: {brightness.item()}")

    if (edge_density > high_edge_threshold and non_zero_pixels < low_pixel_threshold) or entropy < low_entropy_threshold or brightness > high_brightness_threshold:
        return 'most_tempered'
    elif (moderate_edge_threshold < edge_density <= high_edge_threshold or
          moderate_pixel_threshold < non_zero_pixels <= low_pixel_threshold or
          moderate_entropy_threshold < entropy <= low_entropy_threshold or
          low_brightness_threshold <= brightness <= high_brightness_threshold):
        return 'moderately_tempered'
    else:
        return 'less_tempered'

In [6]:
def process_all_images(root_folder, output_zip_path):
    results = {'less_tempered': [], 'moderately_tempered': [], 'most_tempered': []}
    output_base_dir = '/kaggle/working/Categorized_Images'
    
    if not os.path.exists(output_base_dir):
        os.makedirs(output_base_dir)
    
    category_folders = {
        'less_tempered': os.path.join(output_base_dir, 'less_tempered'),
        'moderately_tempered': os.path.join(output_base_dir, 'moderately_tempered'),
        'most_tempered': os.path.join(output_base_dir, 'most_tempered')
    }
    
    for category_folder in category_folders.values():
        if not os.path.exists(category_folder):
            os.makedirs(category_folder)
    
    for index, filename in enumerate(os.listdir(root_folder)):
        if filename.lower().endswith('.tif'):
            image_path = os.path.join(root_folder, filename)
            image = load_and_convert_to_grayscale(image_path)
            if image is None:
                print(filename, "is skipped")
                continue  # Skip to the next image if loading failed
            category = classify_image(image, index, filename)
            results[category].append(image_path)
            save_image(image_path, category_folders[category])
    
    # Create a zip file of the categorized images
    with zipfile.ZipFile(output_zip_path, 'w') as zipf:
        for category, category_folder in category_folders.items():
            zipf.write(category_folder, os.path.basename(category_folder))
            for root, _, files in os.walk(category_folder):
                for file in files:
                    zipf.write(os.path.join(root, file), os.path.join(category, file))
    
    return results, output_zip_path



In [7]:
root_folder = '/kaggle/input/manuscripts-combined/IGNCA_latest'
output_zip_path = '/kaggle/working/Categorized_Images.zip'  # Update with desired output zip path

results, output_zip_path = process_all_images(root_folder, output_zip_path)

print("Categorized images saved to:", output_zip_path)


Index: 0, Filename: manuscripts (4481).tif
Edge Density: 9028506624.0
Non-Zero Pixels: 67956701
Entropy: 0.9191698472707194
Brightness: 169.77650451660156

Index: 1, Filename: manuscripts (4762).tif
Edge Density: 13164577792.0
Non-Zero Pixels: 47419332
Entropy: 0.9963770251862494
Brightness: 118.46790313720703

Index: 2, Filename: manuscripts (1256).tif
Edge Density: 2108514048.0
Non-Zero Pixels: 100553707
Entropy: 0.4427928252918411
Brightness: 231.56454467773438

Index: 3, Filename: manuscripts (4550).tif
Edge Density: 7706615808.0
Non-Zero Pixels: 82133555
Entropy: 0.7124593082556872
Brightness: 205.194580078125

Index: 4, Filename: manuscripts (3225).tif
Edge Density: 2770073088.0
Non-Zero Pixels: 103454698
Entropy: 0.33129298611283875
Brightness: 239.45204162597656

Index: 5, Filename: manuscripts (3608).tif
Edge Density: 1810960896.0
Non-Zero Pixels: 101248753
Entropy: 0.5316778481852873
Brightness: 224.19406127929688

Index: 6, Filename: manuscripts (3947).tif
Edge Density: 336