In [None]:
# Install pytesseract, which is imported below and used for OCR via `pytesseract.image_to_string`.
# NOTE(review): pytesseract wraps the external Tesseract executable; confirm the runtime provides it.
%pip install pytesseract

In [None]:
# Mount Google Drive at /content/drive so the photo folder used later in the notebook is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os, itertools, shutil, cv2, pytesseract
import time
import numpy as np

from os.path import basename
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity

import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.models import mobilenet_v3_small
from tqdm import tqdm
from transformers import CLIPModel, CLIPProcessor

# Define model and transform only once
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `pretrained=True` is deprecated in current torchvision; the "IMAGENET1K_V1"
# weights are the checkpoint that flag used to select, so embeddings are unchanged.
_model = mobilenet_v3_small(weights="IMAGENET1K_V1")
# Drop the final child module (MobileNetV3's classifier head) so the network
# emits pooled feature maps that are used as image embeddings.
_model = torch.nn.Sequential(*list(_model.children())[:-1])
_model.to(device).eval()

# Preprocessing applied to every image before it is fed to `_model`:
# fixed 224x224 resize, tensor conversion, then per-channel normalisation
# with the ImageNet mean / std the pretrained weights expect.
_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

def move_text_heavy_images(parent_dir, text_ratio_thresh=0.000035):
    """Move images that contain a lot of OCR-detected text into ``text_heavy``.

    Every ``.jpg`` / ``.jpeg`` / ``.png`` file directly inside ``parent_dir``
    is run through Tesseract. The number of recognised characters divided by
    the pixel count of the image is the "text ratio"; files whose ratio
    exceeds ``text_ratio_thresh`` are moved to ``parent_dir/text_heavy``
    (created if missing).

    Args:
        parent_dir: Folder whose top-level images are scanned.
        text_ratio_thresh: Characters-per-pixel value above which an image
            counts as text heavy.

    Returns:
        None. Files are moved on disk; per-image ratios and skipped files are
        reported through ``tqdm.write``.
    """
    text_heavy_dir = os.path.join(parent_dir, "text_heavy")
    os.makedirs(text_heavy_dir, exist_ok=True)

    valid_exts = ('.jpg', '.jpeg', '.png')
    image_files = [f for f in os.listdir(parent_dir) if f.lower().endswith(valid_exts)]

    for img_file in tqdm(image_files, desc="Scanning images", unit="img"):
        img_path = os.path.join(parent_dir, img_file)
        try:
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread reports an unreadable file by returning None
                # rather than raising, so surface a clear reason here.
                tqdm.write(f"Skipping {img_file}: could not be read as an image")
                continue

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            text = pytesseract.image_to_string(gray)
            text_len = len(text.strip())
            height, width = img.shape[:2]
            ratio = text_len / (height * width)

            # Print ratio for each image. Six decimals are needed because the
            # default threshold (0.000035) is invisible at five.
            tqdm.write(f"{img_file} --> Text Ratio: {ratio:.6f}")

            if ratio > text_ratio_thresh:
                shutil.move(img_path, os.path.join(text_heavy_dir, img_file))
        except Exception as e:
            # Best effort: one bad file must not abort the whole scan.
            tqdm.write(f"Skipping {img_file}: {e}")

class ImageDataset(Dataset):
    """Dataset that serves (transformed image, source path) pairs from a list of files."""

    def __init__(self, image_paths, transform):
        # Callable applied to each PIL image, and the files to read, in order.
        self.transform = transform
        self.image_paths = image_paths

    def __len__(self):
        """Number of image paths held by the dataset."""
        n_images = len(self.image_paths)
        return n_images

    def __getitem__(self, idx):
        """Load the image at position ``idx`` as RGB and return it transformed, with its path."""
        image_path = self.image_paths[idx]
        rgb_image = Image.open(image_path).convert("RGB")
        transformed = self.transform(rgb_image)
        return transformed, image_path

def group_similar_images(input_dir, eps=0.1, batch_size=32, model=_model, transform=_transform, use_gpu=True):
    """Group visually similar images of ``input_dir`` into ``group_<k>`` subfolders.

    Each top-level ``.jpg`` / ``.jpeg`` / ``.png`` file is embedded with
    ``model`` and L2-normalised, so a dot product is the cosine similarity.
    Clustering is greedy: the first image not yet assigned seeds a cluster,
    and every later unassigned image whose cosine distance to that *seed* is
    at most ``eps`` joins it. Clusters with two or more members are moved
    into ``input_dir/group_<k>``; singletons stay where they are.

    Args:
        input_dir: Folder whose top-level images are grouped.
        eps: Maximum cosine distance (``1 - similarity``) to the cluster seed.
        batch_size: Batch size used when computing embeddings.
        model: Embedding network; moved to the selected device before use.
        transform: Preprocessing applied to each PIL image.
        use_gpu: Use CUDA when it is available.

    Returns:
        None. Files are moved on disk and a report is printed.

    Raises:
        ValueError: If ``model`` or ``transform`` is ``None``.
    """
    if model is None or transform is None:
        raise ValueError("Model and transform must be provided for optimized usage.")

    device = torch.device('cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
    # The model may sit on another device than the one picked here (e.g. the
    # default CUDA model with use_gpu=False); keep model and inputs together.
    model = model.to(device)

    valid_exts = ('.jpg', '.jpeg', '.png')
    image_paths = [os.path.join(input_dir, f)
                   for f in os.listdir(input_dir)
                   if f.lower().endswith(valid_exts)]

    if not image_paths:
        print("No valid images found.")
        return

    dataset = ImageDataset(image_paths, transform)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    embeddings, final_paths = [], []

    for batch_imgs, batch_paths in loader:
        batch_imgs = batch_imgs.to(device)
        with torch.no_grad():
            # Squeeze away the two trailing singleton spatial dims of the pooled features.
            batch_emb = model(batch_imgs).squeeze(-1).squeeze(-1).cpu().numpy()
        norms = np.linalg.norm(batch_emb, axis=1, keepdims=True)
        # Guard against an all-zero embedding, which would otherwise produce NaNs.
        batch_emb = batch_emb / np.maximum(norms, 1e-12)
        embeddings.append(batch_emb)
        final_paths.extend(batch_paths)

    embeddings = np.vstack(embeddings).astype(np.float32)

    used = set()          # indices already assigned (as seed or as member)
    cluster_id = 1
    grouped_count = 0     # images that ended up inside a multi-image cluster
    n = len(final_paths)

    for i in range(n):
        if i in used:
            continue

        cluster_indices = [i]
        used.add(i)

        for j in range(i + 1, n):
            if j in used:
                continue
            sim = np.dot(embeddings[i], embeddings[j])
            if 1 - sim <= eps:
                cluster_indices.append(j)
                used.add(j)

        if len(cluster_indices) > 1:
            group_dir = os.path.join(input_dir, f"group_{cluster_id}")
            os.makedirs(group_dir, exist_ok=True)
            print(f"\n📂 Cluster {cluster_id}: {len(cluster_indices)} images")

            # Print similarities within the group
            for a, b in itertools.combinations(cluster_indices, 2):
                sim = np.dot(embeddings[a], embeddings[b])
                print(f"  {basename(final_paths[a])} ↔ {basename(final_paths[b])} → sim: {sim:.4f}, dist: {1 - sim:.4f}")

            # Move files
            for idx in cluster_indices:
                shutil.move(final_paths[idx], os.path.join(group_dir, os.path.basename(final_paths[idx])))

            grouped_count += len(cluster_indices)
            cluster_id += 1

    # `used` also contains singleton seeds, so report the dedicated counter instead.
    print(f"\n[✓] Grouped {grouped_count} images into {cluster_id - 1} strict clusters.")

def pick_best_image_per_folder(parent_dir):
    """Score the images of every subfolder with CLIP and mark the best one.

    For each immediate subdirectory of ``parent_dir``, every ``.jpg`` /
    ``.jpeg`` / ``.png`` file is embedded with CLIP and compared (cosine
    similarity) against a fixed text prompt. The highest-scoring file is
    renamed with a ``best_`` prefix, which is the marker
    ``move_best_and_clean`` looks for.

    Args:
        parent_dir: Folder whose immediate subdirectories are evaluated.

    Returns:
        None. The winning file of each subfolder is renamed on disk and the
        scores are printed.
    """
    best_prefix = "best_"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load CLIP model and processor
    _clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device).eval()
    _clip_proc = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    # Prompt can be tuned here
    text_prompt = "a beautiful, sharp, well-composed photo with attractive facial expressions, tall & slim body, aesthetic eyes, flawless skin & background."
    text_inputs = _clip_proc(text=[text_prompt], return_tensors="pt").to(device)
    with torch.no_grad():
        text_feats = _clip_model.get_text_features(
            input_ids=text_inputs["input_ids"],
            attention_mask=text_inputs["attention_mask"]
        )
        text_feats = text_feats / text_feats.norm(p=2, dim=-1, keepdim=True)

    # Loop over each subfolder
    # NOTE(review): every subdirectory is evaluated, including a `text_heavy`
    # folder if one exists under parent_dir -- confirm that is intended.
    for subfolder in sorted(os.listdir(parent_dir)):
        subfolder_path = os.path.join(parent_dir, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        print(f"\n📁 Evaluating folder: {subfolder}")
        image_scores = []
        image_paths = []

        # Collect all images in this subfolder
        for filename in sorted(os.listdir(subfolder_path)):
            if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                path = os.path.join(subfolder_path, filename)
                try:
                    img = Image.open(path).convert("RGB")
                except Exception:
                    # Narrower than a bare `except:` so Ctrl-C / SystemExit still propagate.
                    print(f"  ⚠️ Skipping unreadable image: {filename}")
                    continue

                img_inputs = _clip_proc(images=img, return_tensors="pt").to(device)

                with torch.no_grad():
                    img_feats = _clip_model.get_image_features(pixel_values=img_inputs["pixel_values"])
                    img_feats = img_feats / img_feats.norm(p=2, dim=-1, keepdim=True)
                    score = float(cosine_similarity(img_feats.cpu(), text_feats.cpu()).squeeze())

                image_scores.append(score)
                image_paths.append(path)

                print(f"  📷 {filename} --> Score: {score:.4f}")

        # Mark the best image
        if image_scores:
            best_idx = int(torch.tensor(image_scores).argmax())
            best_path = image_paths[best_idx]
            folder, filename = os.path.split(best_path)
            if filename.lower().startswith(best_prefix):
                # Already marked by an earlier run; do not stack the prefix.
                best_name = best_path
            else:
                best_name = os.path.join(folder, f"{best_prefix}{filename}")
                os.rename(best_path, best_name)
            print(f"\n✅ Best image in '{subfolder}': {os.path.basename(best_name)} (Score: {image_scores[best_idx]:.4f})")

def move_best_and_clean(parent_dir):
    """Move the marked "best" images up to ``parent_dir`` and delete the subfolders.

    Every leaf subdirectory (one with no subdirectories of its own) below
    ``parent_dir`` is scanned. Files marked as best -- either with a
    ``best_`` name prefix or a ``_best`` suffix before the extension -- are
    moved into ``parent_dir`` with the marker stripped. A numeric ``_<n>``
    suffix is appended when the target name is already taken. The leaf
    directory and everything left in it is then deleted.

    Args:
        parent_dir: Root folder; it is never deleted itself.

    Returns:
        None. Files are moved / deleted on disk.
    """
    prefix, suffix = "best_", "_best"

    # Walk through all subdirectories
    for root, dirs, files in os.walk(parent_dir, topdown=False):
        # Skip parent directory
        if root == parent_dir:
            continue

        # Only last-level subdirectories (no further subdirs) are processed
        if dirs:
            continue

        for file in files:
            stem, ext = os.path.splitext(file)
            if file.lower().startswith(prefix):
                dest_filename = file[len(prefix):]
            elif len(stem) > len(suffix) and stem.lower().endswith(suffix):
                # Also honour the "<name>_best.<ext>" convention so such
                # winners are rescued instead of deleted with the folder.
                dest_filename = stem[:-len(suffix)] + ext
            else:
                continue

            src_path = os.path.join(root, file)
            dest_path = os.path.join(parent_dir, dest_filename)

            # Avoid overwriting in parent dir
            name, dest_ext = os.path.splitext(dest_filename)
            counter = 1
            while os.path.exists(dest_path):
                dest_path = os.path.join(parent_dir, f"{name}_{counter}{dest_ext}")
                counter += 1

            shutil.move(src_path, dest_path)

        # Delete the entire subdirectory and its contents
        shutil.rmtree(root)

    print("[✓] Done moving 'best_' files and cleaning subdirectories.")

def revert(parent_dir):
    """Flatten ``parent_dir``: pull every file out of its subfolders back to the top level.

    Files keep their name unless it is already taken in ``parent_dir``, in
    which case the first free ``<stem>_<n><ext>`` (n = 1, 2, ...) is used.
    Subfolders are removed afterwards when they are empty; a folder that
    cannot be removed is silently left in place.
    """
    for folder, _subdirs, filenames in os.walk(parent_dir, topdown=False):
        if folder != parent_dir:
            for filename in filenames:
                target = os.path.join(parent_dir, filename)
                stem, extension = os.path.splitext(filename)

                # Probe numbered variants until an unused name is found
                attempt = 0
                while os.path.exists(target):
                    attempt += 1
                    target = os.path.join(parent_dir, f"{stem}_{attempt}{extension}")

                os.rename(os.path.join(folder, filename), target)

            # Best effort: only directories that are empty by now go away
            try:
                os.rmdir(folder)
            except OSError:
                pass

In [None]:
# Folder of photos handed to every pipeline step below; those steps move,
# rename and delete files underneath it.
input_dir = r"/content/drive/MyDrive/Photos_PJ"

In [None]:
# Run the whole pipeline on `input_dir`, timing every stage.
# The stages work in place: files are moved and renamed, and the last stage
# deletes the leaf subfolders together with whatever is left in them.
t1 = time.time()
print(f"Processing images in: {input_dir}")

print("Filtering text-heavy images...")
move_text_heavy_images(input_dir)
t2 = time.time()
print(f"Text-heavy images moving took: {t2 - t1:.2f} seconds\n")

print("Grouping similar images...")
group_similar_images(input_dir)
t3 = time.time()
print(f"Grouping similar images took: {t3 - t2:.2f} seconds\n")

print("Picking best images from similar ones...")
pick_best_image_per_folder(input_dir)
t4 = time.time()
print(f"Picking best images took: {t4 - t3:.2f} seconds\n")

print("Cleaning up and moving best images...")
move_best_and_clean(input_dir)
t5 = time.time()
print(f"Cleaning bad pics took: {t5 - t4:.2f} seconds\n")

## Past operations --> for optimizations only

In [None]:
def print_cluster_similarities(image_paths, labels, embeddings):
    """Print the pairwise similarity of all images that share a cluster label.

    ``labels[k]`` is the cluster of ``image_paths[k]`` / ``embeddings[k]``;
    the label ``-1`` is treated as noise and ignored, as are clusters with a
    single member. Similarity is the dot product of the two embeddings.
    """
    members_by_cluster = {}
    for position, cluster_label in enumerate(labels):
        if cluster_label != -1:  # -1 marks noise
            if cluster_label not in members_by_cluster:
                members_by_cluster[cluster_label] = []
            members_by_cluster[cluster_label].append(position)

    for cluster_label in members_by_cluster:
        members = members_by_cluster[cluster_label]
        if len(members) >= 2:  # singleton clusters have no pairs to report
            print(f"\nCluster {cluster_label+1} — {len(members)} images")
            for first, second in itertools.combinations(members, 2):
                # cosine similarity (L2-normalized)
                similarity = np.dot(embeddings[first], embeddings[second])
                print(f"  {basename(image_paths[first])} ↔ {basename(image_paths[second])}  →  similarity: {similarity:.4f}, distance: {1-similarity:.4f}")

In [None]:
# Run clustering
# Uses `input_dir` from the configuration cell; the previous `dir` argument was
# never assigned in this notebook and resolves to the Python builtin on a fresh kernel.
t1 = time.time()
group_similar_images(input_dir, eps=0.1, batch_size=32, model=_model, transform=_transform)
print(f"Completed in {time.time() - t1:.2f} seconds.")


📂 Cluster 1: 2 images
  IMG_20250621_165922782.jpg ↔ IMG_20250621_170008508.jpg → sim: 0.9244, dist: 0.0756

📂 Cluster 2: 3 images
  IMG_20250621_122354245.jpg ↔ IMG_20250621_122348827.jpg → sim: 0.9072, dist: 0.0928
  IMG_20250621_122354245.jpg ↔ IMG_20250621_122324444.jpg → sim: 0.9159, dist: 0.0841
  IMG_20250621_122348827.jpg ↔ IMG_20250621_122324444.jpg → sim: 0.9158, dist: 0.0842

📂 Cluster 3: 27 images
  IMG_20250718_135847115.jpg ↔ IMG_20250718_135747072.jpg → sim: 0.9663, dist: 0.0337
  IMG_20250718_135847115.jpg ↔ IMG_20250718_135820431.jpg → sim: 0.9119, dist: 0.0881
  IMG_20250718_135847115.jpg ↔ IMG_20250718_135814548.jpg → sim: 0.9146, dist: 0.0854
  IMG_20250718_135847115.jpg ↔ IMG_20250718_135812948.jpg → sim: 0.9056, dist: 0.0944
  IMG_20250718_135847115.jpg ↔ IMG_20250718_135815851.jpg → sim: 0.9121, dist: 0.0879
  IMG_20250718_135847115.jpg ↔ IMG_20250718_135808168.jpg → sim: 0.9722, dist: 0.0278
  IMG_20250718_135847115.jpg ↔ IMG_20250718_135806960.jpg → sim: 0.963

In [None]:
# Time the best-image selection on `input_dir` (the previous `dir` argument was
# never assigned in this notebook and resolves to the Python builtin on a fresh kernel).
t1 = time.time()
pick_best_image_per_folder(input_dir)
print(time.time()-t1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]


📁 Evaluating folder: group_1
  📷 IMG_20250621_165922782.jpg --> Score: 0.1320
  📷 IMG_20250621_170008508.jpg --> Score: 0.1540

✅ Best image in 'group_1': best_IMG_20250621_170008508.jpg (Score: 0.1540)

📁 Evaluating folder: group_10
  📷 IMG_20250722_101949160.jpg --> Score: 0.1822
  📷 IMG_20250722_101953933.jpg --> Score: 0.2022

✅ Best image in 'group_10': best_IMG_20250722_101953933.jpg (Score: 0.2022)

📁 Evaluating folder: group_11
  📷 IMG_20250621_164514759.jpg --> Score: 0.1674
  📷 IMG_20250621_164519122.jpg --> Score: 0.1682

✅ Best image in 'group_11': best_IMG_20250621_164519122.jpg (Score: 0.1682)

📁 Evaluating folder: group_12
  📷 IMG_20250628_204801095.jpg --> Score: 0.1916
  📷 IMG_20250628_204904721.jpg --> Score: 0.2062

✅ Best image in 'group_12': best_IMG_20250628_204904721.jpg (Score: 0.2062)

📁 Evaluating folder: group_13
  📷 IMG_20250621_122059083.jpg --> Score: 0.2060
  📷 IMG_20250621_122059990.jpg --> Score: 0.2154

✅ Best image in 'group_13': best_IMG_20250621_12