In [1]:
# ============================================================
# Duplicate Image Scanner (EXACT bytes hash + VISUAL pHash)
# - Recursively scans ALL subfolders under a dataset root
# - EXACT duplicates: SHA-256 of file bytes (robust to filename)
# - VISUAL duplicates: perceptual hash (pHash) of decoded image
# - Prints a detailed report + saves CSV reports
# - FIX: Always write CSVs with headers (even if empty) 
# ============================================================

import os, sys, hashlib, json
from pathlib import Path

# -----------------------------
# Auto-install dependencies (Jupyter-friendly)
# -----------------------------
def _ensure_packages():
    """Install the third-party packages this notebook needs but cannot import.

    Every required module is import-tested. The pip names of the ones that
    fail are installed with a single ``pip install`` run through the
    interpreter executing this code (``sys.executable``). Nothing is printed
    or installed when every import succeeds.

    Raises:
        subprocess.CalledProcessError: if the ``pip install`` command exits
            with a non-zero status.
    """
    import importlib, subprocess

    # (import name, pip distribution name) - the two differ for Pillow.
    required = (
        ("PIL", "pillow"),
        ("imagehash", "imagehash"),
        ("pandas", "pandas"),
        ("tqdm", "tqdm"),
    )

    def _importable(module_name):
        # Any exception raised while importing counts as "missing",
        # not only ImportError.
        try:
            importlib.import_module(module_name)
        except Exception:
            return False
        return True

    missing = [
        pip_name
        for module_name, pip_name in required
        if not _importable(module_name)
    ]
    if not missing:
        return

    print("[INFO] Installing missing packages:", missing)
    subprocess.check_call([sys.executable, "-m", "pip", "install", *missing])

_ensure_packages()

import pandas as pd
from tqdm import tqdm
from PIL import Image
import imagehash

# -----------------------------
# CONFIG (edit these)
# -----------------------------
# Root folder of the dataset; every subfolder below it is scanned.
DATASET_ROOT = r"D:\AIUB\DSP\Code\Datasets\C8\RetinalOCT_Dataset"
# Run the pHash pass as well. It is a second pass that decodes every image,
# and it groups files whose decoded content hashes identically even when
# their bytes differ (e.g. re-encoded copies).
DO_VISUAL = True
# Passed to imagehash.phash as `hash_size` (16 = strong, 8 = faster).
# NOTE(review): despite the name this is not a bit count; per the imagehash
# docs the resulting hash should have hash_size**2 bits - confirm.
PHASH_BITS = 16
# Maximum number of groups printed per console report (CSV reports are not truncated).
PRINT_MAX_GROUPS = 30
# Path component names to skip while scanning.
IGNORE_DIRS = ("dup_reports", ".git", "__pycache__", ".ipynb_checkpoints")
# Where reports are written; None -> <dataset_root>/dup_reports
REPORT_DIR = None

# -----------------------------
# Helpers
# -----------------------------
# File suffixes (lower-case, with the dot) that are treated as images.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".gif", ".webp"}

def iter_image_files(root: str, ignore_dirs=None):
    """Yield every image file found recursively under ``root``.

    A path is yielded when it is a regular file whose suffix, lower-cased,
    is in ``IMG_EXTS``. Paths are produced in whatever order ``Path.rglob``
    returns them; no sorting is applied.

    Args:
        root: Directory to scan (``str`` or ``Path``).
        ignore_dirs: Iterable of names to skip. A file is skipped when any
            component of its path *below* ``root`` equals one of these names
            exactly. ``None`` or empty disables the filter.

    Yields:
        pathlib.Path: ``root``-prefixed path of each matching image file.
    """
    root = Path(root)
    ignore_dirs = set(ignore_dirs or [])
    for p in root.rglob("*"):
        # Cheap suffix test first; is_file() has to touch the filesystem.
        if p.suffix.lower() not in IMG_EXTS or not p.is_file():
            continue
        # Only components below `root` are checked. Testing the absolute
        # path would silently drop every file whenever the dataset itself
        # lives under a folder that happens to share an ignored name.
        if any(part in ignore_dirs for part in p.relative_to(root).parts):
            continue
        yield p

def sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the hex SHA-256 digest of a file's raw bytes.

    The file is read in pieces of ``chunk_size`` bytes so it is never held
    in memory all at once.

    Args:
        path: File to hash.
        chunk_size: Number of bytes requested per read.

    Returns:
        The digest as a hexadecimal string.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

def phash_image(path: Path, hash_size: int = 16) -> str:
    """Return the perceptual hash (pHash) of an image file as a string.

    The image is opened with Pillow, converted to RGB and passed to
    ``imagehash.phash``; the returned hash object is stringified.

    Args:
        path: Image file to hash.
        hash_size: Forwarded unchanged to ``imagehash.phash``.

    Returns:
        ``str()`` of the hash object produced by ``imagehash.phash``.
    """
    with Image.open(path) as source:
        rgb = source.convert("RGB")
        fingerprint = imagehash.phash(rgb, hash_size=hash_size)
    return str(fingerprint)

def guess_split(dataset_root: Path, file_path: Path):
    """Infer the dataset split of a file from the components of its path.

    The path is taken relative to ``dataset_root`` and its components are
    checked from left to right, case-insensitively.

    Args:
        dataset_root: Root directory that ``file_path`` lives under.
        file_path: Path of the file to classify.

    Returns:
        The first component, lower-cased, that is one of ``train``, ``val``,
        ``valid``, ``validation`` or ``test``; ``"unknown"`` if none matches.

    Raises:
        ValueError: if ``file_path`` is not located under ``dataset_root``
            (raised by ``Path.relative_to``).
    """
    split_names = {"train", "val", "valid", "validation", "test"}
    components = file_path.relative_to(dataset_root).parts
    lowered = (component.lower() for component in components)
    return next((name for name in lowered if name in split_names), "unknown")

def _group_to_row(group_type: str, key: str, paths: list, dataset_root: Path):
    """Build one report row (a plain dict) for a group of duplicate files.

    Member paths are stringified and ordered shortest-first, ties broken by
    plain string comparison. The first one becomes the canonical path and
    the remaining ones are reported as its duplicates.

    Args:
        group_type: Label stored in the row's ``group_type`` field.
        key: Hash shared by the group, stored as ``group_key``.
        paths: Paths of all files in the group (must not be empty).
        dataset_root: Root used by ``guess_split`` to infer each file's split.

    Returns:
        dict with the keys ``group_type``, ``group_key``, ``canonical_path``,
        ``duplicate_count``, ``duplicate_paths`` and ``all_paths``
        (``;``-joined), ``splits_involved`` (``,``-joined, sorted) and
        ``cross_split`` (1 when more than one split is involved, else 0).
    """
    ordered = [str(member) for member in paths]
    ordered.sort(key=lambda text: (len(text), text))
    keeper = ordered[0]
    extras = ordered[1:]

    # Distinct split names across every member, the canonical file included.
    seen_splits = set()
    for text in ordered:
        seen_splits.add(guess_split(dataset_root, Path(text)))
    split_list = sorted(seen_splits)

    return {
        "group_type": group_type,
        "group_key": key,
        "canonical_path": keeper,
        "duplicate_count": len(extras),
        "duplicate_paths": ";".join(extras),
        "all_paths": ";".join(ordered),
        "splits_involved": ",".join(split_list),
        "cross_split": 1 if len(split_list) > 1 else 0,
    }

def print_groups(title: str, groups: list, max_groups: int = 50):
    """Print a console report for a list of duplicate groups.

    A banner with ``title`` is printed first. Then, for at most
    ``max_groups`` groups, the canonical path, the duplicate count, the
    splits involved and each duplicate path are listed. If groups were left
    out, a final line states how many.

    Args:
        title: Text shown in the report banner.
        groups: Sized sequence of row dicts providing the keys
            ``canonical_path``, ``duplicate_count``, ``duplicate_paths``
            (``;``-joined), ``splits_involved`` and ``cross_split``.
        max_groups: Maximum number of groups to print. Zero or a negative
            value prints no groups, only the "not shown" line.
    """
    rule = "=" * 90
    print("\n" + rule)
    print(f"[REPORT] {title}")
    print(rule)
    if not groups:
        print("No groups found.")
        return

    # Clamp and test the limit *before* printing, so a limit of 0 (or less)
    # really prints nothing instead of leaking the first group.
    limit = max(0, max_groups)
    for i, g in enumerate(groups, start=1):
        if i > limit:
            break
        print(f"\n[{i}] Canonical: {g['canonical_path']}")
        print(f"    Duplicates found: {g['duplicate_count']}")
        print(f"    Splits involved: {g['splits_involved']} (cross_split={bool(g['cross_split'])})")
        if g["duplicate_count"] > 0:
            for dp in g["duplicate_paths"].split(";"):
                if dp.strip():
                    print(f"      - {dp}")

    remaining = len(groups) - min(limit, len(groups))
    if remaining > 0:
        print(f"\n... (showing first {limit} groups, {remaining} more not shown)")

# -----------------------------
# Main scanner
# -----------------------------
def run_duplicate_scan(
    dataset_root: str,
    report_dir: str = None,
    do_visual: bool = True,
    phash_bits: int = 16,
    print_max_groups: int = 50,
    ignore_dirs=("dup_reports", ".git", "__pycache__"),
):
    """Scan a dataset tree for duplicate images and write CSV/JSON reports.

    Every image returned by ``iter_image_files`` goes through up to two
    independent passes:

    * exact pass - files are grouped by the SHA-256 of their bytes;
    * visual pass (only when ``do_visual``) - files are grouped by the
      string returned by ``phash_image``. Only identical hash strings are
      grouped; no distance tolerance is applied.

    Groups with at least two members are reported. A file that raises
    during a pass is recorded (one row per failure) and the scan continues.

    Files written to the report directory: ``duplicates_exact_sha256.csv``,
    ``duplicates_visual_phash.csv`` (visual pass only),
    ``duplicates_combined.csv``, ``bad_or_unreadable.csv`` and
    ``summary.json``. The CSVs always carry a header row, even when empty.

    Args:
        dataset_root: Directory to scan recursively.
        report_dir: Output directory; a falsy value means
            ``<dataset_root>/dup_reports``. Created if missing.
        do_visual: Whether to run the pHash pass.
        phash_bits: Forwarded to ``phash_image`` as ``hash_size``.
        print_max_groups: Forwarded to ``print_groups`` as ``max_groups``.
        ignore_dirs: Names forwarded to ``iter_image_files``.

    Returns:
        dict with the keys ``summary``, ``exact_groups``, ``visual_groups``,
        ``report_dir``, ``exact_csv``, ``visual_csv`` (``None`` without the
        visual pass), ``combined_csv`` and ``bad_csv``.

    Raises:
        AssertionError: if ``dataset_root`` does not exist.
    """
    dataset_root = Path(dataset_root)
    # Explicit raise instead of `assert`: the check must survive `python -O`,
    # otherwise the mkdir below would create the missing tree and the scan
    # would "succeed" on nothing. AssertionError is kept for compatibility.
    if not dataset_root.exists():
        raise AssertionError(f"Path does not exist: {dataset_root}")

    report_dir = Path(report_dir) if report_dir else (dataset_root / "dup_reports")
    report_dir.mkdir(parents=True, exist_ok=True)

    files = list(iter_image_files(dataset_root, ignore_dirs=set(ignore_dirs)))
    print(f"[INFO] Found {len(files):,} image files under: {dataset_root}")

    bad_files = []   # (path, error) - one entry per failed hashing attempt
    exact_map = {}   # sha256 -> [paths]
    visual_map = {}  # phash  -> [paths]

    # ---- EXACT hashing pass ----
    for p in tqdm(files, desc="Hashing (SHA-256)", unit="img"):
        try:
            h = sha256_file(p)
        except Exception as e:
            bad_files.append((str(p), f"sha256_error: {e}"))
        else:
            exact_map.setdefault(h, []).append(p)

    # ---- VISUAL hashing pass ----
    if do_visual:
        for p in tqdm(files, desc=f"Hashing (pHash-{phash_bits})", unit="img"):
            try:
                h = phash_image(p, hash_size=phash_bits)
            except Exception as e:
                bad_files.append((str(p), f"phash_error: {e}"))
            else:
                visual_map.setdefault(h, []).append(p)

    # ---- Build groups (only hashes shared by two or more files) ----
    exact_groups = [
        _group_to_row("exact_sha256", k, ps, dataset_root)
        for k, ps in exact_map.items()
        if len(ps) >= 2
    ]
    # visual_map stays empty when the visual pass is off, so this is [] then.
    visual_groups = [
        _group_to_row("visual_phash", k, ps, dataset_root)
        for k, ps in visual_map.items()
        if len(ps) >= 2
    ]

    # Largest groups first, ties broken by canonical path.
    exact_groups.sort(key=lambda d: (-d["duplicate_count"], d["canonical_path"]))
    visual_groups.sort(key=lambda d: (-d["duplicate_count"], d["canonical_path"]))

    # ---- Save CSVs (explicit columns -> header is written even if empty) ----
    cols = [
        "group_type",
        "group_key",
        "canonical_path",
        "duplicate_count",
        "duplicate_paths",
        "all_paths",
        "splits_involved",
        "cross_split",
    ]

    exact_csv = report_dir / "duplicates_exact_sha256.csv"
    visual_csv = report_dir / "duplicates_visual_phash.csv"
    combined_csv = report_dir / "duplicates_combined.csv"
    bad_csv = report_dir / "bad_or_unreadable.csv"
    summary_json = report_dir / "summary.json"

    pd.DataFrame(exact_groups, columns=cols).to_csv(exact_csv, index=False)
    if do_visual:
        pd.DataFrame(visual_groups, columns=cols).to_csv(visual_csv, index=False)
    # Exact rows first, then visual rows (none when the visual pass is off).
    pd.DataFrame(exact_groups + visual_groups, columns=cols).to_csv(combined_csv, index=False)

    pd.DataFrame(bad_files, columns=["path", "error"]).to_csv(bad_csv, index=False)

    # ---- Print reports ----
    print_groups("EXACT duplicates (same bytes / SHA-256)", exact_groups, max_groups=print_max_groups)
    if do_visual:
        print_groups(f"VISUAL duplicates (pHash hash_size={phash_bits})", visual_groups, max_groups=print_max_groups)

    # ---- Summary ----
    exact_dup_groups = len(exact_groups)
    visual_dup_groups = len(visual_groups)
    exact_dup_images = sum(g["duplicate_count"] for g in exact_groups)
    visual_dup_images = sum(g["duplicate_count"] for g in visual_groups)
    exact_cross = sum(g["cross_split"] for g in exact_groups)
    visual_cross = sum(g["cross_split"] for g in visual_groups)
    # Count distinct files: one that fails both passes has two rows in
    # bad_files but is still a single bad image.
    bad_image_count = len({path for path, _ in bad_files})

    summary = {
        "dataset_root": str(dataset_root),
        "total_images_scanned": len(files),
        "bad_or_unreadable_images": bad_image_count,
        "exact_duplicate_groups": exact_dup_groups,
        "exact_duplicate_images_excluding_canonicals": int(exact_dup_images),
        "exact_groups_cross_split": int(exact_cross),
        "visual_duplicate_groups": int(visual_dup_groups),
        "visual_duplicate_images_excluding_canonicals": int(visual_dup_images),
        "visual_groups_cross_split": int(visual_cross),
        "reports_dir": str(report_dir),
        "exact_csv": str(exact_csv),
        "visual_csv": str(visual_csv) if do_visual else None,
        "combined_csv": str(combined_csv),
        "bad_csv": str(bad_csv),
    }

    with open(summary_json, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print("\n" + "=" * 90)
    print("[SUMMARY]")
    print(f"Total images scanned: {len(files):,}")
    print(f"Bad/unreadable images: {bad_image_count:,}")
    print(f"Exact duplicate groups: {exact_dup_groups:,} (cross-split groups: {exact_cross:,})")
    if do_visual:
        print(f"Visual duplicate groups: {visual_dup_groups:,} (cross-split groups: {visual_cross:,})")
    print(f"Saved reports to: {report_dir}")
    print("=" * 90)

    return {
        "summary": summary,
        "exact_groups": exact_groups,
        "visual_groups": visual_groups,
        "report_dir": str(report_dir),
        "exact_csv": str(exact_csv),
        "visual_csv": str(visual_csv) if do_visual else None,
        "combined_csv": str(combined_csv),
        "bad_csv": str(bad_csv),
    }

# -----------------------------
# RUN using the config above
# -----------------------------
# Run the scan with the CONFIG values; `result` holds the summary, the group
# rows and the paths of the report files that were written.
result = run_duplicate_scan(
    dataset_root=DATASET_ROOT,
    report_dir=REPORT_DIR,
    do_visual=DO_VISUAL,
    phash_bits=PHASH_BITS,
    print_max_groups=PRINT_MAX_GROUPS,
    ignore_dirs=IGNORE_DIRS,
)

# Preview the first 20 rows of the combined report. The CSV is written with a
# header row even when no duplicate groups exist, so read_csv does not fail
# on an empty report.
print("\n[INFO] Preview:", result["combined_csv"])
print(pd.read_csv(result["combined_csv"]).head(20))


[INFO] Found 24,000 image files under: D:\AIUB\DSP\Code\Datasets\C8\RetinalOCT_Dataset


Hashing (SHA-256): 100%|███████████████████████████████████████████████████████| 24000/24000 [02:54<00:00, 137.76img/s]
Hashing (pHash-16): 100%|██████████████████████████████████████████████████████| 24000/24000 [01:25<00:00, 281.92img/s]



[REPORT] EXACT duplicates (same bytes / SHA-256)

[1] Canonical: D:\AIUB\DSP\Code\Datasets\C8\RetinalOCT_Dataset\test\CNV\cnv_test_1002.jpg
    Duplicates found: 1
    Splits involved: test,train (cross_split=True)
      - D:\AIUB\DSP\Code\Datasets\C8\RetinalOCT_Dataset\train\CNV\cnv_train_2822.jpg

[2] Canonical: D:\AIUB\DSP\Code\Datasets\C8\RetinalOCT_Dataset\test\CNV\cnv_test_1066.jpg
    Duplicates found: 1
    Splits involved: test,train (cross_split=True)
      - D:\AIUB\DSP\Code\Datasets\C8\RetinalOCT_Dataset\train\CNV\cnv_train_1105.jpg

[3] Canonical: D:\AIUB\DSP\Code\Datasets\C8\RetinalOCT_Dataset\test\CNV\cnv_test_1071.jpg
    Duplicates found: 1
    Splits involved: test,train (cross_split=True)
      - D:\AIUB\DSP\Code\Datasets\C8\RetinalOCT_Dataset\train\CNV\cnv_train_1193.jpg

[4] Canonical: D:\AIUB\DSP\Code\Datasets\C8\RetinalOCT_Dataset\test\CNV\cnv_test_1073.jpg
    Duplicates found: 1
    Splits involved: test,train (cross_split=True)
      - D:\AIUB\DSP\Code\Datase

In [2]:
# ============================================================
# Duplicate Image Scanner (EXACT bytes hash + VISUAL pHash)
# - Recursively scans ALL subfolders under a dataset root
# - EXACT duplicates: SHA-256 of file bytes (robust to filename)
# - VISUAL duplicates: perceptual hash (pHash) of decoded image
# - Prints a detailed report + saves CSV reports
# - FIX: Always write CSVs with headers (even if empty) 
# ============================================================

import os, sys, hashlib, json
from pathlib import Path

# -----------------------------
# Auto-install dependencies (Jupyter-friendly)
# -----------------------------
def _ensure_packages():
    """Install the third-party packages this notebook needs but cannot import.

    Every required module is import-tested. The pip names of the ones that
    fail are installed with a single ``pip install`` run through the
    interpreter executing this code (``sys.executable``). Nothing is printed
    or installed when every import succeeds.

    Raises:
        subprocess.CalledProcessError: if the ``pip install`` command exits
            with a non-zero status.
    """
    import importlib, subprocess

    # (import name, pip distribution name) - the two differ for Pillow.
    required = (
        ("PIL", "pillow"),
        ("imagehash", "imagehash"),
        ("pandas", "pandas"),
        ("tqdm", "tqdm"),
    )

    def _importable(module_name):
        # Any exception raised while importing counts as "missing",
        # not only ImportError.
        try:
            importlib.import_module(module_name)
        except Exception:
            return False
        return True

    missing = [
        pip_name
        for module_name, pip_name in required
        if not _importable(module_name)
    ]
    if not missing:
        return

    print("[INFO] Installing missing packages:", missing)
    subprocess.check_call([sys.executable, "-m", "pip", "install", *missing])

_ensure_packages()

import pandas as pd
from tqdm import tqdm
from PIL import Image
import imagehash

# -----------------------------
# CONFIG (edit these)
# -----------------------------
# Root folder of the dataset; every subfolder below it is scanned.
DATASET_ROOT = r"D:\AIUB\DSP\Code\Datasets\C8\RetinalOCT_Dataset_CLEAN_SHAONLY"
# Run the pHash pass as well. It is a second pass that decodes every image,
# and it groups files whose decoded content hashes identically even when
# their bytes differ (e.g. re-encoded copies).
DO_VISUAL = True
# Passed to imagehash.phash as `hash_size` (16 = strong, 8 = faster).
# NOTE(review): despite the name this is not a bit count; per the imagehash
# docs the resulting hash should have hash_size**2 bits - confirm.
PHASH_BITS = 16
# Maximum number of groups printed per console report (CSV reports are not truncated).
PRINT_MAX_GROUPS = 30
# Path component names to skip while scanning.
IGNORE_DIRS = ("dup_reports", ".git", "__pycache__", ".ipynb_checkpoints")
# Where reports are written; None -> <dataset_root>/dup_reports
REPORT_DIR = None

# -----------------------------
# Helpers
# -----------------------------
# File suffixes (lower-case, with the dot) that are treated as images.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".gif", ".webp"}

def iter_image_files(root: str, ignore_dirs=None):
    """Yield every image file found recursively under ``root``.

    A path is yielded when it is a regular file whose suffix, lower-cased,
    is in ``IMG_EXTS``. Paths are produced in whatever order ``Path.rglob``
    returns them; no sorting is applied.

    Args:
        root: Directory to scan (``str`` or ``Path``).
        ignore_dirs: Iterable of names to skip. A file is skipped when any
            component of its path *below* ``root`` equals one of these names
            exactly. ``None`` or empty disables the filter.

    Yields:
        pathlib.Path: ``root``-prefixed path of each matching image file.
    """
    root = Path(root)
    ignore_dirs = set(ignore_dirs or [])
    for p in root.rglob("*"):
        # Cheap suffix test first; is_file() has to touch the filesystem.
        if p.suffix.lower() not in IMG_EXTS or not p.is_file():
            continue
        # Only components below `root` are checked. Testing the absolute
        # path would silently drop every file whenever the dataset itself
        # lives under a folder that happens to share an ignored name.
        if any(part in ignore_dirs for part in p.relative_to(root).parts):
            continue
        yield p

def sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the hex SHA-256 digest of a file's raw bytes.

    The file is read in pieces of ``chunk_size`` bytes so it is never held
    in memory all at once.

    Args:
        path: File to hash.
        chunk_size: Number of bytes requested per read.

    Returns:
        The digest as a hexadecimal string.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

def phash_image(path: Path, hash_size: int = 16) -> str:
    """Return the perceptual hash (pHash) of an image file as a string.

    The image is opened with Pillow, converted to RGB and passed to
    ``imagehash.phash``; the returned hash object is stringified.

    Args:
        path: Image file to hash.
        hash_size: Forwarded unchanged to ``imagehash.phash``.

    Returns:
        ``str()`` of the hash object produced by ``imagehash.phash``.
    """
    with Image.open(path) as source:
        rgb = source.convert("RGB")
        fingerprint = imagehash.phash(rgb, hash_size=hash_size)
    return str(fingerprint)

def guess_split(dataset_root: Path, file_path: Path):
    """Infer the dataset split of a file from the components of its path.

    The path is taken relative to ``dataset_root`` and its components are
    checked from left to right, case-insensitively.

    Args:
        dataset_root: Root directory that ``file_path`` lives under.
        file_path: Path of the file to classify.

    Returns:
        The first component, lower-cased, that is one of ``train``, ``val``,
        ``valid``, ``validation`` or ``test``; ``"unknown"`` if none matches.

    Raises:
        ValueError: if ``file_path`` is not located under ``dataset_root``
            (raised by ``Path.relative_to``).
    """
    split_names = {"train", "val", "valid", "validation", "test"}
    components = file_path.relative_to(dataset_root).parts
    lowered = (component.lower() for component in components)
    return next((name for name in lowered if name in split_names), "unknown")

def _group_to_row(group_type: str, key: str, paths: list, dataset_root: Path):
    """Build one report row (a plain dict) for a group of duplicate files.

    Member paths are stringified and ordered shortest-first, ties broken by
    plain string comparison. The first one becomes the canonical path and
    the remaining ones are reported as its duplicates.

    Args:
        group_type: Label stored in the row's ``group_type`` field.
        key: Hash shared by the group, stored as ``group_key``.
        paths: Paths of all files in the group (must not be empty).
        dataset_root: Root used by ``guess_split`` to infer each file's split.

    Returns:
        dict with the keys ``group_type``, ``group_key``, ``canonical_path``,
        ``duplicate_count``, ``duplicate_paths`` and ``all_paths``
        (``;``-joined), ``splits_involved`` (``,``-joined, sorted) and
        ``cross_split`` (1 when more than one split is involved, else 0).
    """
    ordered = [str(member) for member in paths]
    ordered.sort(key=lambda text: (len(text), text))
    keeper = ordered[0]
    extras = ordered[1:]

    # Distinct split names across every member, the canonical file included.
    seen_splits = set()
    for text in ordered:
        seen_splits.add(guess_split(dataset_root, Path(text)))
    split_list = sorted(seen_splits)

    return {
        "group_type": group_type,
        "group_key": key,
        "canonical_path": keeper,
        "duplicate_count": len(extras),
        "duplicate_paths": ";".join(extras),
        "all_paths": ";".join(ordered),
        "splits_involved": ",".join(split_list),
        "cross_split": 1 if len(split_list) > 1 else 0,
    }

def print_groups(title: str, groups: list, max_groups: int = 50):
    """Print a console report for a list of duplicate groups.

    A banner with ``title`` is printed first. Then, for at most
    ``max_groups`` groups, the canonical path, the duplicate count, the
    splits involved and each duplicate path are listed. If groups were left
    out, a final line states how many.

    Args:
        title: Text shown in the report banner.
        groups: Sized sequence of row dicts providing the keys
            ``canonical_path``, ``duplicate_count``, ``duplicate_paths``
            (``;``-joined), ``splits_involved`` and ``cross_split``.
        max_groups: Maximum number of groups to print. Zero or a negative
            value prints no groups, only the "not shown" line.
    """
    rule = "=" * 90
    print("\n" + rule)
    print(f"[REPORT] {title}")
    print(rule)
    if not groups:
        print("No groups found.")
        return

    # Clamp and test the limit *before* printing, so a limit of 0 (or less)
    # really prints nothing instead of leaking the first group.
    limit = max(0, max_groups)
    for i, g in enumerate(groups, start=1):
        if i > limit:
            break
        print(f"\n[{i}] Canonical: {g['canonical_path']}")
        print(f"    Duplicates found: {g['duplicate_count']}")
        print(f"    Splits involved: {g['splits_involved']} (cross_split={bool(g['cross_split'])})")
        if g["duplicate_count"] > 0:
            for dp in g["duplicate_paths"].split(";"):
                if dp.strip():
                    print(f"      - {dp}")

    remaining = len(groups) - min(limit, len(groups))
    if remaining > 0:
        print(f"\n... (showing first {limit} groups, {remaining} more not shown)")

# -----------------------------
# Main scanner
# -----------------------------
def run_duplicate_scan(
    dataset_root: str,
    report_dir: str = None,
    do_visual: bool = True,
    phash_bits: int = 16,
    print_max_groups: int = 50,
    ignore_dirs=("dup_reports", ".git", "__pycache__"),
):
    """Scan a dataset tree for duplicate images and write CSV/JSON reports.

    Every image returned by ``iter_image_files`` goes through up to two
    independent passes:

    * exact pass - files are grouped by the SHA-256 of their bytes;
    * visual pass (only when ``do_visual``) - files are grouped by the
      string returned by ``phash_image``. Only identical hash strings are
      grouped; no distance tolerance is applied.

    Groups with at least two members are reported. A file that raises
    during a pass is recorded (one row per failure) and the scan continues.

    Files written to the report directory: ``duplicates_exact_sha256.csv``,
    ``duplicates_visual_phash.csv`` (visual pass only),
    ``duplicates_combined.csv``, ``bad_or_unreadable.csv`` and
    ``summary.json``. The CSVs always carry a header row, even when empty.

    Args:
        dataset_root: Directory to scan recursively.
        report_dir: Output directory; a falsy value means
            ``<dataset_root>/dup_reports``. Created if missing.
        do_visual: Whether to run the pHash pass.
        phash_bits: Forwarded to ``phash_image`` as ``hash_size``.
        print_max_groups: Forwarded to ``print_groups`` as ``max_groups``.
        ignore_dirs: Names forwarded to ``iter_image_files``.

    Returns:
        dict with the keys ``summary``, ``exact_groups``, ``visual_groups``,
        ``report_dir``, ``exact_csv``, ``visual_csv`` (``None`` without the
        visual pass), ``combined_csv`` and ``bad_csv``.

    Raises:
        AssertionError: if ``dataset_root`` does not exist.
    """
    dataset_root = Path(dataset_root)
    # Explicit raise instead of `assert`: the check must survive `python -O`,
    # otherwise the mkdir below would create the missing tree and the scan
    # would "succeed" on nothing. AssertionError is kept for compatibility.
    if not dataset_root.exists():
        raise AssertionError(f"Path does not exist: {dataset_root}")

    report_dir = Path(report_dir) if report_dir else (dataset_root / "dup_reports")
    report_dir.mkdir(parents=True, exist_ok=True)

    files = list(iter_image_files(dataset_root, ignore_dirs=set(ignore_dirs)))
    print(f"[INFO] Found {len(files):,} image files under: {dataset_root}")

    bad_files = []   # (path, error) - one entry per failed hashing attempt
    exact_map = {}   # sha256 -> [paths]
    visual_map = {}  # phash  -> [paths]

    # ---- EXACT hashing pass ----
    for p in tqdm(files, desc="Hashing (SHA-256)", unit="img"):
        try:
            h = sha256_file(p)
        except Exception as e:
            bad_files.append((str(p), f"sha256_error: {e}"))
        else:
            exact_map.setdefault(h, []).append(p)

    # ---- VISUAL hashing pass ----
    if do_visual:
        for p in tqdm(files, desc=f"Hashing (pHash-{phash_bits})", unit="img"):
            try:
                h = phash_image(p, hash_size=phash_bits)
            except Exception as e:
                bad_files.append((str(p), f"phash_error: {e}"))
            else:
                visual_map.setdefault(h, []).append(p)

    # ---- Build groups (only hashes shared by two or more files) ----
    exact_groups = [
        _group_to_row("exact_sha256", k, ps, dataset_root)
        for k, ps in exact_map.items()
        if len(ps) >= 2
    ]
    # visual_map stays empty when the visual pass is off, so this is [] then.
    visual_groups = [
        _group_to_row("visual_phash", k, ps, dataset_root)
        for k, ps in visual_map.items()
        if len(ps) >= 2
    ]

    # Largest groups first, ties broken by canonical path.
    exact_groups.sort(key=lambda d: (-d["duplicate_count"], d["canonical_path"]))
    visual_groups.sort(key=lambda d: (-d["duplicate_count"], d["canonical_path"]))

    # ---- Save CSVs (explicit columns -> header is written even if empty) ----
    cols = [
        "group_type",
        "group_key",
        "canonical_path",
        "duplicate_count",
        "duplicate_paths",
        "all_paths",
        "splits_involved",
        "cross_split",
    ]

    exact_csv = report_dir / "duplicates_exact_sha256.csv"
    visual_csv = report_dir / "duplicates_visual_phash.csv"
    combined_csv = report_dir / "duplicates_combined.csv"
    bad_csv = report_dir / "bad_or_unreadable.csv"
    summary_json = report_dir / "summary.json"

    pd.DataFrame(exact_groups, columns=cols).to_csv(exact_csv, index=False)
    if do_visual:
        pd.DataFrame(visual_groups, columns=cols).to_csv(visual_csv, index=False)
    # Exact rows first, then visual rows (none when the visual pass is off).
    pd.DataFrame(exact_groups + visual_groups, columns=cols).to_csv(combined_csv, index=False)

    pd.DataFrame(bad_files, columns=["path", "error"]).to_csv(bad_csv, index=False)

    # ---- Print reports ----
    print_groups("EXACT duplicates (same bytes / SHA-256)", exact_groups, max_groups=print_max_groups)
    if do_visual:
        print_groups(f"VISUAL duplicates (pHash hash_size={phash_bits})", visual_groups, max_groups=print_max_groups)

    # ---- Summary ----
    exact_dup_groups = len(exact_groups)
    visual_dup_groups = len(visual_groups)
    exact_dup_images = sum(g["duplicate_count"] for g in exact_groups)
    visual_dup_images = sum(g["duplicate_count"] for g in visual_groups)
    exact_cross = sum(g["cross_split"] for g in exact_groups)
    visual_cross = sum(g["cross_split"] for g in visual_groups)
    # Count distinct files: one that fails both passes has two rows in
    # bad_files but is still a single bad image.
    bad_image_count = len({path for path, _ in bad_files})

    summary = {
        "dataset_root": str(dataset_root),
        "total_images_scanned": len(files),
        "bad_or_unreadable_images": bad_image_count,
        "exact_duplicate_groups": exact_dup_groups,
        "exact_duplicate_images_excluding_canonicals": int(exact_dup_images),
        "exact_groups_cross_split": int(exact_cross),
        "visual_duplicate_groups": int(visual_dup_groups),
        "visual_duplicate_images_excluding_canonicals": int(visual_dup_images),
        "visual_groups_cross_split": int(visual_cross),
        "reports_dir": str(report_dir),
        "exact_csv": str(exact_csv),
        "visual_csv": str(visual_csv) if do_visual else None,
        "combined_csv": str(combined_csv),
        "bad_csv": str(bad_csv),
    }

    with open(summary_json, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print("\n" + "=" * 90)
    print("[SUMMARY]")
    print(f"Total images scanned: {len(files):,}")
    print(f"Bad/unreadable images: {bad_image_count:,}")
    print(f"Exact duplicate groups: {exact_dup_groups:,} (cross-split groups: {exact_cross:,})")
    if do_visual:
        print(f"Visual duplicate groups: {visual_dup_groups:,} (cross-split groups: {visual_cross:,})")
    print(f"Saved reports to: {report_dir}")
    print("=" * 90)

    return {
        "summary": summary,
        "exact_groups": exact_groups,
        "visual_groups": visual_groups,
        "report_dir": str(report_dir),
        "exact_csv": str(exact_csv),
        "visual_csv": str(visual_csv) if do_visual else None,
        "combined_csv": str(combined_csv),
        "bad_csv": str(bad_csv),
    }

# -----------------------------
# RUN using the config above
# -----------------------------
# Run the scan with the CONFIG values; `result` holds the summary, the group
# rows and the paths of the report files that were written.
result = run_duplicate_scan(
    dataset_root=DATASET_ROOT,
    report_dir=REPORT_DIR,
    do_visual=DO_VISUAL,
    phash_bits=PHASH_BITS,
    print_max_groups=PRINT_MAX_GROUPS,
    ignore_dirs=IGNORE_DIRS,
)

# Preview the first 20 rows of the combined report. The CSV is written with a
# header row even when no duplicate groups exist, so read_csv does not fail
# on an empty report.
print("\n[INFO] Preview:", result["combined_csv"])
print(pd.read_csv(result["combined_csv"]).head(20))


[INFO] Found 1,730 image files under: D:\AIUB\DSP\Code\Datasets\THOCT1800\THOCT1800_CLEAN_SHAONLY


Hashing (SHA-256): 100%|████████████████████████████████████████████████████████| 1730/1730 [00:00<00:00, 1832.14img/s]
Hashing (pHash-16): 100%|████████████████████████████████████████████████████████| 1730/1730 [00:02<00:00, 777.94img/s]


[REPORT] EXACT duplicates (same bytes / SHA-256)
No groups found.

[REPORT] VISUAL duplicates (pHash hash_size=16)
No groups found.

[SUMMARY]
Total images scanned: 1,730
Bad/unreadable images: 0
Exact duplicate groups: 0 (cross-split groups: 0)
Visual duplicate groups: 0 (cross-split groups: 0)
Saved reports to: D:\AIUB\DSP\Code\Datasets\THOCT1800\THOCT1800_CLEAN_SHAONLY\dup_reports

[INFO] Preview: D:\AIUB\DSP\Code\Datasets\THOCT1800\THOCT1800_CLEAN_SHAONLY\dup_reports\duplicates_combined.csv
Empty DataFrame
Columns: [group_type, group_key, canonical_path, duplicate_count, duplicate_paths, all_paths, splits_involved, cross_split]
Index: []



