In [32]:
from types import SimpleNamespace

# Run configuration read by the driver cell that plans/prints/applies the renames.
CFG = SimpleNamespace(
    # Base directory; each entry's "json" and "imgdir" are joined onto it.
    ROOT=".",
    # False = dry run: renames are only printed (prefixed with "[DRY] ").
    APPLY=False,
    # True = also write a "<json stem>_with_local_images.json" sidecar per field.
    WRITE_JSON=False,
    # One entry per field: a label, the records JSON, and the folder holding its images.
    FIELDS=[
        {"field": "art_music",   "json": "art_music_with_bios_imaged_crawled.json",          "imgdir": "art_music_images"},
        {"field": "athlete",     "json": "athlete_merged_with_bios_imaged_crawled.json",     "imgdir": "athlete_images"},
        {"field": "poets",       "json": "poets_merged_with_bios_imaged_crawled.json",       "imgdir": "poets_images"},
        {"field": "scientists",  "json": "scientists_merged_with_bios_imaged_crawled.json",  "imgdir": "scientists_images"},
    ],
)

In [33]:
import json, re, sys, unicodedata
from pathlib import Path
from collections import defaultdict
import sys

In [34]:
# Matches the original image names: {idx}_{safe}_{image|crawled}[_N].{ext}
# (case-insensitive). The tag must be exactly "image" or "crawled"; a tag with
# digits attached directly (e.g. "image1") does not match this pattern.
SRC_PATTERN = re.compile(
    r"^(?P<idx>\d+?)_(?P<safe>.+?)_(?P<tag>image|crawled)(?:_[0-9]+)?\.(?P<ext>jpe?g|png|webp|gif|bmp|tiff)$",
    re.IGNORECASE
)

In [35]:
def sanitize_filename(s: str) -> str:
    """Turn ``s`` into the "safe" token used in the original image file names.

    The value is stringified and trimmed, every whitespace run becomes a single
    underscore, and any character that is not a word character, "-" or "." is
    dropped. An empty result is replaced by the literal "image".
    """
    trimmed = str(s).strip()
    underscored = re.sub(r"\s+", "_", trimmed)
    cleaned = re.sub(r"[^\w\-.]+", "", underscored)
    if not cleaned:
        return "image"
    return cleaned

def coerce_to_text(x):
    """Reduce a str | list | dict | other value to a single string.

    - str: returned unchanged.
    - list: the longest non-blank str/int/float item (as text), or "" if none.
    - dict: the first non-blank string found under the preferred keys, else all
      non-blank str/int/float values joined with single spaces.
    - None: "". Anything else: ``str(x)``.
    """
    scalar_types = (str, int, float)

    def _nonblank_scalars(values):
        # Stringified scalar items whose text is not just whitespace.
        return [str(v) for v in values if isinstance(v, scalar_types) and str(v).strip()]

    if x is None:
        return ""
    if isinstance(x, str):
        return x
    if isinstance(x, list):
        texts = _nonblank_scalars(x)
        if not texts:
            return ""
        return max(texts, key=len)
    if isinstance(x, dict):
        for key in ("name","fa","fa_IR","en","value","text","title"):
            value = x.get(key)
            if isinstance(value, str) and value.strip():
                return value
        return " ".join(_nonblank_scalars(x.values()))
    return str(x)

def normalize_text(s: str) -> str:
    """Coerce ``s`` to text, apply Unicode NFC normalization, and trim whitespace."""
    text = coerce_to_text(s)
    composed = unicodedata.normalize("NFC", text)
    return composed.strip()

def safeish_human_name(name: str) -> str:
    """Return the normalized name with path-unsafe characters removed.

    Spaces and non-ASCII letters are kept; only ``<>:"/\\|?*`` and the control
    characters U+0000-U+001F are stripped.
    """
    cleaned = normalize_text(name)
    unsafe_chars = re.compile(r'[<>:"/\\|?*\u0000-\u001F]')
    return unsafe_chars.sub("", cleaned)

def load_json(path: Path):
    """Read ``path`` as UTF-8 text and return the decoded JSON value."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)

def build_record_maps(records):
    """
    Index the JSON records for lookup by numeric index and by safe name.

    Args:
      records: a list of record dicts, or a dict whose values are the records.

    Returns:
      by_index: dict[int] -> record, for records whose 'index' / 'idx' / 'id'
                value parses as an int (a later record with the same index
                replaces an earlier one)
      by_safe:  dict[str] -> list[record], keyed by
                sanitize_filename(normalize_text(name)).lower(), where name is
                taken from 'name', 'Name' or 'title'
    """
    by_index = {}
    by_safe = defaultdict(list)
    iterable = records if isinstance(records, list) else records.values()

    for rec in iterable:
        if not isinstance(rec, dict):
            # Entries without .get() cannot be matched to a file; skip them
            # instead of aborting the whole scan.
            continue

        name_raw = rec.get("name") or rec.get("Name") or rec.get("title") or ""
        safe = sanitize_filename(normalize_text(name_raw)).lower()

        idx_val = rec.get("index", rec.get("idx", rec.get("id")))
        if idx_val is not None:
            try:
                by_index[int(str(idx_val))] = rec
            except ValueError:
                # Non-numeric index: the record stays reachable through by_safe only.
                pass

        by_safe[safe].append(rec)

    return by_index, by_safe

def pick_record(idx, safe_key, by_index, by_safe):
    """Return the record for a file: the index match if there is one, otherwise
    the safe-name match when exactly one record has that safe name, else None."""
    if idx in by_index:
        return by_index[idx]
    candidates = by_safe.get(safe_key, [])
    if len(candidates) != 1:
        return None
    return candidates[0]

def parse_source_files(imgdir: Path):
    """List the regular files directly inside ``imgdir`` whose names match SRC_PATTERN.

    Returns a list of ``(idx, safe, ext, path)`` tuples where ``idx`` is an int
    and ``safe`` / ``ext`` are lower-cased.
    """
    parsed = []
    for path in imgdir.glob("*"):
        match = SRC_PATTERN.match(path.name) if path.is_file() else None
        if not match:
            continue
        parsed.append(
            (int(match.group("idx")), match.group("safe").lower(), match.group("ext").lower(), path)
        )
    return parsed

def plan_renames(imgdir: Path, records):
    """
    Build the rename plan as a list of (src, dst) paths, with dst named
    {rep}_{idx}_{human}.{ext} inside ``imgdir``.

    Files are grouped by (idx, safe); within a group they are ordered by file
    name and numbered rep = 0..N-1. The human name comes from the matched
    record's "name" when a record is found, otherwise from the safe token with
    underscores turned into spaces.
    """
    by_index, by_safe = build_record_maps(records)

    grouped = defaultdict(list)
    for idx, safe, ext, path in parse_source_files(imgdir):
        grouped[(idx, safe)].append((ext, path))

    plan = []
    for (idx, safe), members in grouped.items():
        from_filename = safe.replace("_", " ")
        rec = pick_record(idx, safe, by_index, by_safe)
        display = from_filename if rec is None else rec.get("name", from_filename)
        human = safeish_human_name(display)

        ordered = sorted(members, key=lambda pair: pair[1].name)
        # Requested format: rep_idx_human.ext
        plan.extend(
            (path, imgdir / f"{rep}_{idx}_{human}.{ext}")
            for rep, (ext, path) in enumerate(ordered)
        )

    return plan

def ensure_no_collisions(plan):
    """
    Return a copy of ``plan`` in which every destination is unique.

    A destination that was already handed out earlier in the plan, or that
    already exists on disk, gets "_dup1", "_dup2", ... appended to its stem
    until a free name is found.
    """
    claimed = set()
    resolved = []

    def _taken(path):
        # Taken = promised to an earlier plan entry, or present on disk.
        return (path.parent, path.name) in claimed or path.exists()

    for src, dst in plan:
        base, suffix = dst.stem, dst.suffix
        target = dst
        attempt = 0
        while _taken(target):
            attempt += 1
            target = dst.with_name(f"{base}_dup{attempt}{suffix}")
        claimed.add((target.parent, target.name))
        resolved.append((src, target))
    return resolved

def update_sidecar_json(json_path: Path, records, local_map):
    """
    Attach 'local_images' to the records and write them to a sidecar JSON file.

    Records are looked up by the key "{human}_{index}", built from the record's
    "name" (made path-safe) and its "index"; records without an "index" are not
    addressable. ``local_map`` maps such keys to lists of image file names. The
    records are modified in place and written, as a list, next to ``json_path``
    as "<stem>_with_local_images.json".
    """
    rec_list = records if isinstance(records, list) else list(records.values())

    lookup = {}
    for rec in rec_list:
        display = normalize_text(rec.get("name") or "")
        index_value = rec.get("index")
        if index_value is None:
            continue
        lookup[f"{safeish_human_name(display)}_{int(index_value)}"] = rec

    for key, image_names in local_map.items():
        target = lookup.get(key)
        if target is not None:
            target["local_images"] = sorted(image_names)

    sidecar_path = json_path.with_name(json_path.stem + "_with_local_images.json")
    with sidecar_path.open("w", encoding="utf-8") as handle:
        json.dump(rec_list, handle, ensure_ascii=False, indent=2)
    print(f"Sidecar written: {sidecar_path.name}")

In [36]:
# Driver cell: for every configured field, build the rename plan, print it, and
# rename the files on disk only when CFG.APPLY is set.
root = Path(CFG.ROOT)
total_changes = 0

for entry in CFG.FIELDS:
    json_path = root / entry["json"]
    imgdir    = root / entry["imgdir"]

    # Fields with a missing JSON file or image folder are skipped with a warning on stderr.
    if not json_path.exists():
        print(f"[WARN] JSON not found: {json_path}", file=sys.stderr)
        continue
    if not imgdir.exists():
        print(f"[WARN] Image folder not found: {imgdir}", file=sys.stderr)
        continue

    print(f"\n=== Field: {entry['field']} ===")
    records = load_json(json_path)

    # plan & sanitize
    plan = plan_renames(imgdir, records)
    plan = ensure_no_collisions(plan)

    if not plan:
        print("No matching files to rename.")
        continue

    # "{human}_{idx}" -> new file names; only consumed by update_sidecar_json below.
    local_map = defaultdict(list)

    for src, dst in plan:
        print(f"{'' if CFG.APPLY else '[DRY] '}{src.name}  ->  {dst.name}")
        # Incremented in dry runs as well, so this counts planned renames.
        total_changes += 1
        if CFG.APPLY:
            dst.parent.mkdir(parents=True, exist_ok=True)
            src.rename(dst)

        # Parse rep_idx_human for sidecar key (human may contain underscores)
        try:
            parts = dst.stem.split("_", 2)   # [rep, idx, human...]
            idx_str = parts[1] if len(parts) > 1 else ""
            human   = parts[2] if len(parts) > 2 else ""
            key = f"{human}_{idx_str}"
            local_map[key].append(dst.name)
        except Exception:
            pass

    # The sidecar is written whenever WRITE_JSON is set, regardless of APPLY.
    if CFG.WRITE_JSON:
        try:
            update_sidecar_json(json_path, records, local_map)
        except Exception as e:
            print(f"[WARN] Could not write sidecar for {json_path.name}: {e}", file=sys.stderr)

print(f"\nPlanned changes: {total_changes}")
if not CFG.APPLY:
    print("Dry run complete. Set CFG.APPLY=True to perform the renames.")


=== Field: art_music ===
[DRY] 1006_معصومه_ددهبالا_crawled.jpg  ->  0_1006_معصومه ددهبالا.jpg
[DRY] 1008_مرتضی_پاشایی_crawled.jpg  ->  0_1008_مرتضی پاشایی.jpg
[DRY] 1018_امید_سلطانی_crawled.jpg  ->  0_1018_امید سلطانی.jpg
[DRY] 1038_نعمتالله_آزموده_crawled.jpg  ->  0_1038_نعمت‌الله آزموده.jpg
[DRY] 1040_سید_بهنام_صفوی_crawled.jpg  ->  0_1040_سید بهنام صفوی.jpg
[DRY] 1074_داوود_شعبانی_نصر_crawled.jpg  ->  0_1074_داوود شعبانی نصر.jpg
[DRY] 10_نورمحمد_ذوالفقاری_crawled.jpg  ->  0_10_نورمحمد ذوالفقاری.jpg
[DRY] 1118_گوهر_خیراندیش_crawled.jpg  ->  0_1118_گوهر خیراندیش.jpg
[DRY] 114_هادی_حجازیفر_crawled.jpg  ->  0_114_هادی حجازیفر.jpg
[DRY] 1174_یدالله_کابلی_خوانساری_crawled.jpg  ->  0_1174_یدالله کابلی خوانساری.jpg
[DRY] 1217_روحالله_خالقی_crawled.jpg  ->  0_1217_روح‌الله خالقی.jpg
[DRY] 1219_محمود_ذوالفنون_crawled.jpg  ->  0_1219_محمود ذوالفنون.jpg
[DRY] 1240_محمدعلی_زرندی_crawled.jpg  ->  0_1240_محمدعلی زرندی.jpg
[DRY] 1252_مهین_نویدی_crawled.jpg  ->  0_1252_مهین نویدی.jpg
[DRY] 1276_ایر

In [56]:
# Same configuration as before, but with APPLY=True: the driver cell will
# rename the files on disk instead of only printing the plan.
CFG = SimpleNamespace(
    ROOT=".",
    APPLY=True,
    WRITE_JSON=False,
    FIELDS=[
        {"field": "art_music",   "json": "art_music_with_bios_imaged_crawled.json",          "imgdir": "art_music_images"},
        {"field": "athlete",     "json": "athlete_merged_with_bios_imaged_crawled.json",     "imgdir": "athlete_images"},
        {"field": "poets",       "json": "poets_merged_with_bios_imaged_crawled.json",       "imgdir": "poets_images"},
        {"field": "scientists",  "json": "scientists_merged_with_bios_imaged_crawled.json",  "imgdir": "scientists_images"},
    ],
)

In [55]:
# Driver cell (re-run): for every configured field, build the rename plan, print
# it, and rename the files on disk only when CFG.APPLY is set.
root = Path(CFG.ROOT)
total_changes = 0

for entry in CFG.FIELDS:
    json_path = root / entry["json"]
    imgdir    = root / entry["imgdir"]

    # Fields with a missing JSON file or image folder are skipped with a warning on stderr.
    if not json_path.exists():
        print(f"[WARN] JSON not found: {json_path}", file=sys.stderr)
        continue
    if not imgdir.exists():
        print(f"[WARN] Image folder not found: {imgdir}", file=sys.stderr)
        continue

    print(f"\n=== Field: {entry['field']} ===")
    records = load_json(json_path)

    # plan & sanitize
    plan = plan_renames(imgdir, records)
    plan = ensure_no_collisions(plan)

    if not plan:
        print("No matching files to rename.")
        continue

    # "{human}_{idx}" -> new file names; only consumed by update_sidecar_json below.
    local_map = defaultdict(list)

    for src, dst in plan:
        print(f"{'' if CFG.APPLY else '[DRY] '}{src.name}  ->  {dst.name}")
        # Incremented in dry runs as well, so this counts planned renames.
        total_changes += 1
        if CFG.APPLY:
            dst.parent.mkdir(parents=True, exist_ok=True)
            src.rename(dst)

        # Parse rep_idx_human for sidecar key (human may contain underscores)
        try:
            parts = dst.stem.split("_", 2)   # [rep, idx, human...]
            idx_str = parts[1] if len(parts) > 1 else ""
            human   = parts[2] if len(parts) > 2 else ""
            key = f"{human}_{idx_str}"
            local_map[key].append(dst.name)
        except Exception:
            pass

    # The sidecar is written whenever WRITE_JSON is set, regardless of APPLY.
    if CFG.WRITE_JSON:
        try:
            update_sidecar_json(json_path, records, local_map)
        except Exception as e:
            print(f"[WARN] Could not write sidecar for {json_path.name}: {e}", file=sys.stderr)

print(f"\nPlanned changes: {total_changes}")
if not CFG.APPLY:
    print("Dry run complete. Set CFG.APPLY=True to perform the renames.")


=== Field: art_music ===
No matching files to rename.

=== Field: athlete ===


KeyboardInterrupt: 

In [57]:
# Accept old names: {idx}_{safe}_{any-tag}[_N].{ext}
# The tag is now any run of characters containing neither "." nor "_", so tags
# such as "image1" are accepted in addition to "image" / "crawled".
SRC_PATTERN = re.compile(
    r"^(?P<idx>\d+?)_(?P<safe>.+?)_(?P<tag>[^._]+?)(?:_(?P<src_rep>\d+))?\.(?P<ext>jpe?g|png|webp|gif|bmp|tiff)$",
    re.IGNORECASE
)

# Detect already-correct names: {rep}_{idx}_{human}.{ext}
# NOTE(review): any name starting with two underscore-separated numbers matches,
# including an old-style name whose safe part begins with digits
# (e.g. "12_34_x_image.jpg").
TARGET_PATTERN = re.compile(
    r"^(?P<rep>\d+?)_(?P<idx>\d+?)_(?P<human>.+?)\.(?P<ext>jpe?g|png|webp|gif|bmp|tiff)$",
    re.IGNORECASE
)

In [58]:
# Back to a dry run (APPLY=False): the driver cell only prints the planned renames.
CFG = SimpleNamespace(
    ROOT=".",
    APPLY=False,
    WRITE_JSON=False,
    FIELDS=[
        {"field": "art_music",   "json": "art_music_with_bios_imaged_crawled.json",          "imgdir": "art_music_images"},
        {"field": "athlete",     "json": "athlete_merged_with_bios_imaged_crawled.json",     "imgdir": "athlete_images"},
        {"field": "poets",       "json": "poets_merged_with_bios_imaged_crawled.json",       "imgdir": "poets_images"},
        {"field": "scientists",  "json": "scientists_merged_with_bios_imaged_crawled.json",  "imgdir": "scientists_images"},
    ],
)

In [59]:
def parse_source_files(imgdir: Path):
    """List the old-style image files directly inside ``imgdir``.

    Regular files whose names already match TARGET_PATTERN are ignored; of the
    rest, only names matching SRC_PATTERN are kept. Returns a list of
    ``(idx, safe, ext, path)`` tuples with ``idx`` as int and ``safe`` / ``ext``
    lower-cased.
    """
    parsed = []
    for path in imgdir.glob("*"):
        # Not a regular file, or already in the target naming scheme: nothing to do.
        if not path.is_file() or TARGET_PATTERN.match(path.name):
            continue
        match = SRC_PATTERN.match(path.name)
        if not match:
            continue
        parsed.append(
            (int(match.group("idx")), match.group("safe").lower(), match.group("ext").lower(), path)
        )
    return parsed

def existing_max_rep(imgdir: Path, idx: int, human: str) -> int:
    """
    Find the highest rep already present for the same (idx, human) in target format.

    Every entry directly inside ``imgdir`` whose name matches TARGET_PATTERN is
    considered; human names are compared after normalize_text() on both sides.

    Returns:
      The largest rep found, or -1 if no matching entry exists.
    """
    wanted_human = normalize_text(human)
    max_rep = -1
    for p in imgdir.glob("*"):
        m = TARGET_PATTERN.match(p.name)
        if m is None or int(m.group("idx")) != idx:
            continue
        # compare normalized human names
        if normalize_text(m.group("human")) != wanted_human:
            continue
        # "rep" is captured by \d+, the same shape as "idx" converted above, so
        # no blanket except is needed (a bare except would also hide Ctrl-C).
        max_rep = max(max_rep, int(m.group("rep")))
    return max_rep

In [60]:
def plan_renames(imgdir: Path, records):
    """
    Build the rename plan as a list of (src, dst) paths, with dst named
    {rep}_{idx}_{human}.{ext} inside ``imgdir``.

    - Only old-style names returned by parse_source_files() are planned
    - Files are grouped by (idx, safe) and ordered by file name within a group
    - Numbering continues after the highest rep already present for the same
      (idx, human), as reported by existing_max_rep()
    """
    by_index, by_safe = build_record_maps(records)

    grouped = defaultdict(list)
    for idx, safe, ext, path in parse_source_files(imgdir):
        grouped[(idx, safe)].append((ext, path))

    plan = []
    for (idx, safe), members in grouped.items():
        from_filename = safe.replace("_", " ")
        rec = pick_record(idx, safe, by_index, by_safe)
        display = from_filename if rec is None else rec.get("name", from_filename)
        human = safeish_human_name(display)

        # First free rep for this (idx, human); avoids _dup suffixes and overwrites.
        next_rep = existing_max_rep(imgdir, idx, human) + 1
        for ext, path in sorted(members, key=lambda pair: pair[1].name):
            plan.append((path, imgdir / f"{next_rep}_{idx}_{human}.{ext}"))
            next_rep += 1

    return plan

In [61]:
# Driver cell (re-run): for every configured field, build the rename plan, print
# it, and rename the files on disk only when CFG.APPLY is set.
root = Path(CFG.ROOT)
total_changes = 0

for entry in CFG.FIELDS:
    json_path = root / entry["json"]
    imgdir    = root / entry["imgdir"]

    # Fields with a missing JSON file or image folder are skipped with a warning on stderr.
    if not json_path.exists():
        print(f"[WARN] JSON not found: {json_path}", file=sys.stderr)
        continue
    if not imgdir.exists():
        print(f"[WARN] Image folder not found: {imgdir}", file=sys.stderr)
        continue

    print(f"\n=== Field: {entry['field']} ===")
    records = load_json(json_path)

    # plan & sanitize
    plan = plan_renames(imgdir, records)
    plan = ensure_no_collisions(plan)

    if not plan:
        print("No matching files to rename.")
        continue

    # "{human}_{idx}" -> new file names; only consumed by update_sidecar_json below.
    local_map = defaultdict(list)

    for src, dst in plan:
        print(f"{'' if CFG.APPLY else '[DRY] '}{src.name}  ->  {dst.name}")
        # Incremented in dry runs as well, so this counts planned renames.
        total_changes += 1
        if CFG.APPLY:
            dst.parent.mkdir(parents=True, exist_ok=True)
            src.rename(dst)

        # Parse rep_idx_human for sidecar key (human may contain underscores)
        try:
            parts = dst.stem.split("_", 2)   # [rep, idx, human...]
            idx_str = parts[1] if len(parts) > 1 else ""
            human   = parts[2] if len(parts) > 2 else ""
            key = f"{human}_{idx_str}"
            local_map[key].append(dst.name)
        except Exception:
            pass

    # The sidecar is written whenever WRITE_JSON is set, regardless of APPLY.
    if CFG.WRITE_JSON:
        try:
            update_sidecar_json(json_path, records, local_map)
        except Exception as e:
            print(f"[WARN] Could not write sidecar for {json_path.name}: {e}", file=sys.stderr)

print(f"\nPlanned changes: {total_changes}")
if not CFG.APPLY:
    print("Dry run complete. Set CFG.APPLY=True to perform the renames.")


=== Field: art_music ===
No matching files to rename.

=== Field: athlete ===
[DRY] 1000_رضا_میرباقری_crawled.jpg  ->  0_1000_سید علی حق‌شناس کمیاب.jpg
[DRY] 1001_رضا_میرزایی_image.jpg  ->  0_1001_سهراب سرابی.jpg
[DRY] 1004_رضا_نوروزی_image.jpg  ->  0_1004_رضا نوروزی.jpg
[DRY] 1007_رضا_کرمانشاهی_image.jpg  ->  0_1007_رضا کرمانشاهی.jpg
[DRY] 1008_رضا_کریمی_crawled.jpg  ->  0_1008_عباس محمدرضایی.jpg
[DRY] 1009_رضا_کلهر_crawled.jpg  ->  0_1009_رضا کلهر.jpg
[DRY] 100_ابراهیم_قاسمپور_image.jpg  ->  0_100_آندرانیک اسکندریان.jpg
[DRY] 1011_رضا_یزدانی_image.jpg  ->  0_1011_رضا یزدانی.jpg
[DRY] 1012_رضا_یزداندوست_image.jpg  ->  0_1012_لی اروین.jpg
[DRY] 1015_رمضان_خدر_image.jpg  ->  0_1015_بشیر باباجان‌زاده.jpg
[DRY] 1016_رمضان_صالحی_crawled.jpg  ->  0_1016_رمضان صالحی.jpg
[DRY] 1018_رناتو_سیلویرا_crawled.jpg  ->  0_1018_رناتو سیلویرا.jpg
[DRY] 1019_روبرت_ساها_image.jpg  ->  0_1019_فرانک اوفارل.jpg
[DRY] 1020_روبرتو_تورس_crawled.jpg  ->  0_1020_روبرتو تورس.jpg
[DRY] 1021_روحی_پندنواز_image.jpg

In [63]:
def record_safe(rec) -> str:
    """Return the lower-cased safe token for a record's name.

    The name is the first truthy value among the record's "name", "Name" and
    "title" fields, or "" when none is set.
    """
    for field in ("name", "Name", "title"):
        value = rec.get(field)
        if value:
            break
    else:
        value = ""
    return sanitize_filename(normalize_text(value)).lower()

def pick_record(idx, safe_key, by_index, by_safe):
    """
    Pick the JSON record that belongs to a file, trusting the name over the index.

    Args:
      idx:      numeric index parsed from the file name
      safe_key: lower-cased safe name parsed from the file name
      by_index: dict[int] -> record
      by_safe:  dict[str] -> list[record]

    Resolution order:
      1. a unique safe-name match
      2. the by-index record, but only if its own safe name equals safe_key
      3. among several safe-name candidates, the one whose index equals idx
      4. otherwise None (no trustworthy match)
    """
    # 1) unique safe-name match
    recs = by_safe.get(safe_key, [])
    if len(recs) == 1:
        return recs[0]

    # 2) try index record, but only if its safe matches safe_key
    rec_idx = by_index.get(idx)
    if rec_idx and record_safe(rec_idx) == safe_key:
        return rec_idx

    # 3) multiple safe-name candidates: pick the one whose index equals idx
    for r in recs:
        # Take the first index-like field that is actually set. Testing for
        # None (not truthiness) keeps a legitimate index of 0 from falling
        # through to "idx" / "id".
        idx_val = next(
            (r[k] for k in ("index", "idx", "id") if r.get(k) is not None),
            None,
        )
        if idx_val is None:
            continue
        try:
            if int(str(idx_val)) == idx:
                return r
        except ValueError:
            # Non-numeric index value: this candidate cannot match by index.
            continue

    # 4) no trustworthy JSON match
    return None

In [64]:
def plan_renames(imgdir: Path, records):
    """
    Build rename plan (src -> dst) with desired format: {rep}_{idx}_{human}.{ext}

    - Only old-style names returned by parse_source_files() are planned
    - Files are grouped by (idx, safe) and ordered by file name within a group
    - The JSON display name is used only when the picked record's safe name
      equals the safe name in the file name; otherwise the file's own safe name
      (underscores -> spaces) is kept
    - Numbering continues after existing files for the same (idx, human)

    Returns:
      list of (src_path, dst_path) tuples; nothing is renamed here.
    """
    by_index, by_safe = build_record_maps(records)
    files = parse_source_files(imgdir)  # old-style only

    # group by (idx, safe)
    buckets = defaultdict(list)
    for idx, safe, ext, p in files:
        buckets[(idx, safe)].append((ext, p))

    plan = []
    for (idx, safe), items in buckets.items():
        name_from_file = safe.replace("_", " ")
        rec = pick_record(idx, safe, by_index, by_safe)

        if rec is not None and record_safe(rec) == safe:
            # JSON agrees with the filename safe key → trust JSON display name.
            # Read it from the same fields record_safe() matched on, so records
            # keyed "Name" / "title" (or with an empty "name") keep their
            # display name instead of producing a blank or fallback one.
            candidate_name = (
                rec.get("name") or rec.get("Name") or rec.get("title") or name_from_file
            )
        else:
            # JSON absent/ambiguous/mismatched → keep the filename's own safe-name
            candidate_name = name_from_file

        human = safeish_human_name(candidate_name)

        # Start from next available rep (avoid _dupN)
        rep_start = existing_max_rep(imgdir, idx, human) + 1

        items_sorted = sorted(items, key=lambda x: x[1].name)
        for i, (ext, p) in enumerate(items_sorted):
            rep = rep_start + i
            new_name = f"{rep}_{idx}_{human}.{ext}"
            plan.append((p, imgdir / new_name))

    return plan

In [65]:
# Driver cell (re-run): for every configured field, build the rename plan, print
# it, and rename the files on disk only when CFG.APPLY is set.
root = Path(CFG.ROOT)
total_changes = 0

for entry in CFG.FIELDS:
    json_path = root / entry["json"]
    imgdir    = root / entry["imgdir"]

    # Fields with a missing JSON file or image folder are skipped with a warning on stderr.
    if not json_path.exists():
        print(f"[WARN] JSON not found: {json_path}", file=sys.stderr)
        continue
    if not imgdir.exists():
        print(f"[WARN] Image folder not found: {imgdir}", file=sys.stderr)
        continue

    print(f"\n=== Field: {entry['field']} ===")
    records = load_json(json_path)

    # plan & sanitize
    plan = plan_renames(imgdir, records)
    plan = ensure_no_collisions(plan)

    if not plan:
        print("No matching files to rename.")
        continue

    # "{human}_{idx}" -> new file names; only consumed by update_sidecar_json below.
    local_map = defaultdict(list)

    for src, dst in plan:
        print(f"{'' if CFG.APPLY else '[DRY] '}{src.name}  ->  {dst.name}")
        # Incremented in dry runs as well, so this counts planned renames.
        total_changes += 1
        if CFG.APPLY:
            dst.parent.mkdir(parents=True, exist_ok=True)
            src.rename(dst)

        # Parse rep_idx_human for sidecar key (human may contain underscores)
        try:
            parts = dst.stem.split("_", 2)   # [rep, idx, human...]
            idx_str = parts[1] if len(parts) > 1 else ""
            human   = parts[2] if len(parts) > 2 else ""
            key = f"{human}_{idx_str}"
            local_map[key].append(dst.name)
        except Exception:
            pass

    # The sidecar is written whenever WRITE_JSON is set, regardless of APPLY.
    if CFG.WRITE_JSON:
        try:
            update_sidecar_json(json_path, records, local_map)
        except Exception as e:
            print(f"[WARN] Could not write sidecar for {json_path.name}: {e}", file=sys.stderr)

print(f"\nPlanned changes: {total_changes}")
if not CFG.APPLY:
    print("Dry run complete. Set CFG.APPLY=True to perform the renames.")


=== Field: art_music ===
No matching files to rename.

=== Field: athlete ===
[DRY] 1000_رضا_میرباقری_crawled.jpg  ->  0_1000_رضا میرباقری.jpg
[DRY] 1001_رضا_میرزایی_image.jpg  ->  0_1001_رضا میرزایی.jpg
[DRY] 1004_رضا_نوروزی_image.jpg  ->  0_1004_رضا نوروزی.jpg
[DRY] 1007_رضا_کرمانشاهی_image.jpg  ->  0_1007_رضا کرمانشاهی.jpg
[DRY] 1008_رضا_کریمی_crawled.jpg  ->  0_1008_رضا کریمی.jpg
[DRY] 1009_رضا_کلهر_crawled.jpg  ->  0_1009_رضا کلهر.jpg
[DRY] 100_ابراهیم_قاسمپور_image.jpg  ->  0_100_ابراهیم قاسم‌پور.jpg
[DRY] 1011_رضا_یزدانی_image.jpg  ->  0_1011_رضا یزدانی.jpg
[DRY] 1012_رضا_یزداندوست_image.jpg  ->  0_1012_رضا یزدان‌دوست.jpg
[DRY] 1015_رمضان_خدر_image.jpg  ->  0_1015_رمضان خدر.jpg
[DRY] 1016_رمضان_صالحی_crawled.jpg  ->  0_1016_رمضان صالحی.jpg
[DRY] 1018_رناتو_سیلویرا_crawled.jpg  ->  0_1018_رناتو سیلویرا.jpg
[DRY] 1019_روبرت_ساها_image.jpg  ->  0_1019_روبرت ساها.jpg
[DRY] 1020_روبرتو_تورس_crawled.jpg  ->  0_1020_روبرتو تورس.jpg
[DRY] 1021_روحی_پندنواز_image.jpg  ->  0_1021_روحی پن

In [66]:
# APPLY=True again: the next run of the driver cell renames the files on disk.
CFG = SimpleNamespace(
    ROOT=".",
    APPLY=True,
    WRITE_JSON=False,
    FIELDS=[
        {"field": "art_music",   "json": "art_music_with_bios_imaged_crawled.json",          "imgdir": "art_music_images"},
        {"field": "athlete",     "json": "athlete_merged_with_bios_imaged_crawled.json",     "imgdir": "athlete_images"},
        {"field": "poets",       "json": "poets_merged_with_bios_imaged_crawled.json",       "imgdir": "poets_images"},
        {"field": "scientists",  "json": "scientists_merged_with_bios_imaged_crawled.json",  "imgdir": "scientists_images"},
    ],
)

In [67]:
# Driver cell (re-run): for every configured field, build the rename plan, print
# it, and rename the files on disk only when CFG.APPLY is set.
root = Path(CFG.ROOT)
total_changes = 0

for entry in CFG.FIELDS:
    json_path = root / entry["json"]
    imgdir    = root / entry["imgdir"]

    # Fields with a missing JSON file or image folder are skipped with a warning on stderr.
    if not json_path.exists():
        print(f"[WARN] JSON not found: {json_path}", file=sys.stderr)
        continue
    if not imgdir.exists():
        print(f"[WARN] Image folder not found: {imgdir}", file=sys.stderr)
        continue

    print(f"\n=== Field: {entry['field']} ===")
    records = load_json(json_path)

    # plan & sanitize
    plan = plan_renames(imgdir, records)
    plan = ensure_no_collisions(plan)

    if not plan:
        print("No matching files to rename.")
        continue

    # "{human}_{idx}" -> new file names; only consumed by update_sidecar_json below.
    local_map = defaultdict(list)

    for src, dst in plan:
        print(f"{'' if CFG.APPLY else '[DRY] '}{src.name}  ->  {dst.name}")
        # Incremented in dry runs as well, so this counts planned renames.
        total_changes += 1
        if CFG.APPLY:
            dst.parent.mkdir(parents=True, exist_ok=True)
            src.rename(dst)

        # Parse rep_idx_human for sidecar key (human may contain underscores)
        try:
            parts = dst.stem.split("_", 2)   # [rep, idx, human...]
            idx_str = parts[1] if len(parts) > 1 else ""
            human   = parts[2] if len(parts) > 2 else ""
            key = f"{human}_{idx_str}"
            local_map[key].append(dst.name)
        except Exception:
            pass

    # The sidecar is written whenever WRITE_JSON is set, regardless of APPLY.
    if CFG.WRITE_JSON:
        try:
            update_sidecar_json(json_path, records, local_map)
        except Exception as e:
            print(f"[WARN] Could not write sidecar for {json_path.name}: {e}", file=sys.stderr)

print(f"\nPlanned changes: {total_changes}")
if not CFG.APPLY:
    print("Dry run complete. Set CFG.APPLY=True to perform the renames.")


=== Field: art_music ===
No matching files to rename.

=== Field: athlete ===
1000_رضا_میرباقری_crawled.jpg  ->  0_1000_رضا میرباقری.jpg
1001_رضا_میرزایی_image.jpg  ->  0_1001_رضا میرزایی.jpg
1004_رضا_نوروزی_image.jpg  ->  0_1004_رضا نوروزی.jpg
1007_رضا_کرمانشاهی_image.jpg  ->  0_1007_رضا کرمانشاهی.jpg
1008_رضا_کریمی_crawled.jpg  ->  0_1008_رضا کریمی.jpg
1009_رضا_کلهر_crawled.jpg  ->  0_1009_رضا کلهر.jpg
100_ابراهیم_قاسمپور_image.jpg  ->  0_100_ابراهیم قاسم‌پور.jpg
1011_رضا_یزدانی_image.jpg  ->  0_1011_رضا یزدانی.jpg
1012_رضا_یزداندوست_image.jpg  ->  0_1012_رضا یزدان‌دوست.jpg
1015_رمضان_خدر_image.jpg  ->  0_1015_رمضان خدر.jpg
1016_رمضان_صالحی_crawled.jpg  ->  0_1016_رمضان صالحی.jpg
1018_رناتو_سیلویرا_crawled.jpg  ->  0_1018_رناتو سیلویرا.jpg
1019_روبرت_ساها_image.jpg  ->  0_1019_روبرت ساها.jpg
1020_روبرتو_تورس_crawled.jpg  ->  0_1020_روبرتو تورس.jpg
1021_روحی_پندنواز_image.jpg  ->  0_1021_روحی پندنواز.jpg
1022_روح_اله_سیف_اللهی_مقدم_image1.jpg  ->  0_1022_روح‌ اله سیف‌ اللهی مقدم.jpg


In [68]:
# Driver cell (re-run): for every configured field, build the rename plan, print
# it, and rename the files on disk only when CFG.APPLY is set.
root = Path(CFG.ROOT)
total_changes = 0

for entry in CFG.FIELDS:
    json_path = root / entry["json"]
    imgdir    = root / entry["imgdir"]

    # Fields with a missing JSON file or image folder are skipped with a warning on stderr.
    if not json_path.exists():
        print(f"[WARN] JSON not found: {json_path}", file=sys.stderr)
        continue
    if not imgdir.exists():
        print(f"[WARN] Image folder not found: {imgdir}", file=sys.stderr)
        continue

    print(f"\n=== Field: {entry['field']} ===")
    records = load_json(json_path)

    # plan & sanitize
    plan = plan_renames(imgdir, records)
    plan = ensure_no_collisions(plan)

    if not plan:
        print("No matching files to rename.")
        continue

    # "{human}_{idx}" -> new file names; only consumed by update_sidecar_json below.
    local_map = defaultdict(list)

    for src, dst in plan:
        print(f"{'' if CFG.APPLY else '[DRY] '}{src.name}  ->  {dst.name}")
        # Incremented in dry runs as well, so this counts planned renames.
        total_changes += 1
        if CFG.APPLY:
            dst.parent.mkdir(parents=True, exist_ok=True)
            src.rename(dst)

        # Parse rep_idx_human for sidecar key (human may contain underscores)
        try:
            parts = dst.stem.split("_", 2)   # [rep, idx, human...]
            idx_str = parts[1] if len(parts) > 1 else ""
            human   = parts[2] if len(parts) > 2 else ""
            key = f"{human}_{idx_str}"
            local_map[key].append(dst.name)
        except Exception:
            pass

    # The sidecar is written whenever WRITE_JSON is set, regardless of APPLY.
    if CFG.WRITE_JSON:
        try:
            update_sidecar_json(json_path, records, local_map)
        except Exception as e:
            print(f"[WARN] Could not write sidecar for {json_path.name}: {e}", file=sys.stderr)

print(f"\nPlanned changes: {total_changes}")
if not CFG.APPLY:
    print("Dry run complete. Set CFG.APPLY=True to perform the renames.")


=== Field: art_music ===
No matching files to rename.

=== Field: athlete ===
No matching files to rename.

=== Field: poets ===
No matching files to rename.

=== Field: scientists ===
No matching files to rename.

Planned changes: 0


# Add summaries to the politicians JSON

In [69]:
import json, csv, re, unicodedata
from pathlib import Path
from collections import defaultdict

In [70]:
# Files used by the summary merge, relative to the working directory:
#   POL_JSON_IN  - politician records to enrich
#   CSV_PATH     - CSV whose "image_path" and "text" columns supply the summaries
#   POL_JSON_OUT - where the enriched records are written
POL_JSON_IN  = Path("politicians_main.json")
CSV_PATH     = Path("politicians_image_text_pairs.csv")
POL_JSON_OUT = Path("politicians_main_with_summaries.json")

In [71]:
def sanitize_filename(s: str) -> str:
    """Make a file-name-safe token from ``s``.

    Stringify and trim, collapse each whitespace run to one underscore, drop
    everything that is not a word character, "-" or ".", and fall back to the
    literal "image" when nothing is left.
    """
    token = str(s).strip()
    for pattern, replacement in ((r"\s+", "_"), (r"[^\w\-.]+", "")):
        token = re.sub(pattern, replacement, token)
    return token if token else "image"

def coerce_to_text(x):
    """Reduce a str | list | dict | other value to a single string.

    - str: returned unchanged.
    - list: the longest non-blank str/int/float item (as text), or "" if none.
    - dict: the first non-blank string found under the preferred keys, else all
      non-blank str/int/float values joined with single spaces.
    - None: "". Anything else: ``str(x)``.
    """
    def _usable(item):
        # A scalar whose text form is not just whitespace.
        return isinstance(item, (str, int, float)) and bool(str(item).strip())

    if x is None:
        return ""
    if isinstance(x, str):
        return x
    if isinstance(x, list):
        texts = [str(item) for item in x if _usable(item)]
        return max(texts, key=len) if texts else ""
    if isinstance(x, dict):
        for key in ("name","fa","fa_IR","en","value","text","title"):
            value = x.get(key)
            if isinstance(value, str) and value.strip():
                return value
        return " ".join(str(value) for value in x.values() if _usable(value))
    return str(x)

def normalize_text(s: str) -> str:
    """Coerce ``s`` to text, apply Unicode NFC normalization, and trim whitespace."""
    as_text = coerce_to_text(s)
    nfc_text = unicodedata.normalize("NFC", as_text)
    return nfc_text.strip()

def name_key(s: str) -> str:
    """Build the lookup key used to match CSV names against JSON names.

    The name is normalized, then every run of whitespace and/or underscores is
    replaced by a single space.
    """
    return re.sub(r"[\s_]+", " ", normalize_text(s))

def clean_summary(s: str) -> str:
    """Tidy a summary string taken from the CSV or from a record.

    After coercing to text and trimming, leading and trailing runs of quote
    characters are removed, then a trailing "None" (any case, optionally
    preceded by a comma/semicolon and whitespace) is dropped and the result is
    trimmed again.
    """
    edge_quotes = re.compile(r'^(["\']+)|(["\']+)$')
    trailing_none = re.compile(r'(?:[,،؛]?\s*)?None\s*$', flags=re.IGNORECASE)

    text = coerce_to_text(s).strip()
    unquoted = edge_quotes.sub("", text)
    return trailing_none.sub("", unquoted).strip()

In [72]:
# The two image file-name layouts recognised by parse_image_record
# (the extension is whatever follows the last dot):
#   name first: {name}_{idx}_{rep}.{ext}
#   rep first:  {rep}_{idx}_{name}.{ext}
IMG_RE_NAME_FIRST = re.compile(r'^(?P<name>.+)_(?P<idx>\d+)_(?P<rep>\d+)\.[^.]+$')
IMG_RE_REP_FIRST  = re.compile(r'^(?P<rep>\d+)_(?P<idx>\d+)_(?P<name>.+)\.[^.]+$')

def parse_image_record(image_path: str):
    """Split an image path into ``(name, idx, rep)``.

    Only the final path component is examined. The name-first layout is tried
    before the rep-first layout. If neither matches, the last two
    underscore-separated chunks of the stem are used as idx/rep when both are
    numeric; otherwise the stem is returned as the name with idx and rep None.
    """
    base = Path(coerce_to_text(image_path)).name
    for pattern in (IMG_RE_NAME_FIRST, IMG_RE_REP_FIRST):
        hit = pattern.match(base)
        if hit:
            return hit.group("name"), int(hit.group("idx")), int(hit.group("rep"))

    # fallback: take last two numeric chunks as idx/rep
    stem = Path(base).stem
    chunks = stem.split("_")
    if len(chunks) >= 3:
        idx_txt, rep_txt = chunks[-2], chunks[-1]
        if rep_txt.isdigit() and idx_txt.isdigit():
            return "_".join(chunks[:-2]), int(idx_txt), int(rep_txt)

    # last resort: return name only
    return stem, None, None

In [73]:
# Collect every non-empty cleaned summary from the CSV, grouped by normalized
# person name (the name is parsed out of each row's image file name).
summary_map = defaultdict(list)
# newline="" lets the csv module do its own line-ending handling, which it needs
# to read quoted fields containing line breaks correctly.
with CSV_PATH.open("r", encoding="utf-8", newline="") as f:
    rdr = csv.DictReader(f)
    for row in rdr:
        img = row.get("image_path", "")
        txt = row.get("text", "")
        # idx and rep are parsed but not needed for the name-keyed grouping.
        person_name_raw, idx, rep = parse_image_record(img)
        k = name_key(person_name_raw)
        s = clean_summary(txt)
        if s:
            summary_map[k].append(s)

In [74]:
# For each name keep the longest summary collected from the CSV.
best_summary = {k: max(vs, key=len) for k, vs in summary_map.items() if vs}

print(f"Loaded {len(best_summary)} names with summaries from CSV.")

Loaded 980 names with summaries from CSV.


In [75]:
# Load the politicians JSON. `records` is the list of record objects whether the
# file holds a top-level list or a dict of records.
data = json.loads(POL_JSON_IN.read_text(encoding="utf-8"))
records = data if isinstance(data, list) else list(data.values())

In [76]:
# Merge the CSV summaries into the records in place.
#   added     - records that had no non-empty summary before
#   updated   - records whose existing summary was shorter than the CSV one
#   not_found - names of records with no CSV summary under their normalized name
updated = 0
added   = 0
not_found = []

for rec in records:
    rec_name = coerce_to_text(rec.get("name", ""))
    k = name_key(rec_name)
    new_sum = best_summary.get(k)
    if not new_sum:
        not_found.append(rec_name)
        continue

    # Compare against the cleaned form of whatever summary the record already has.
    old_sum = clean_summary(rec.get("summary", ""))
    if not old_sum:
        rec["summary"] = new_sum
        added += 1
    elif len(new_sum) > len(old_sum):
        # Only replace an existing summary when the CSV one is strictly longer.
        rec["summary"] = new_sum
        updated += 1

In [77]:
# `records` holds the very same objects that `data` contains (it is `data`
# itself for a list, or `list(data.values())` for a dict), and the merge step
# updates those objects in place. `data` is therefore already up to date and
# can be written as-is, keeping its original shape and keys. Rebuilding a dict
# through a name -> record lookup would make every entry that shares a
# normalized name point at the same (last) record and drop the others.
out_data = data

POL_JSON_OUT.write_text(json.dumps(out_data, ensure_ascii=False, indent=2), encoding="utf-8")

print(f"Added summaries: {added}, Updated longer summaries: {updated}, Total records: {len(records)}")
# Number of records for which the CSV supplied no summary.
missing = len(not_found)

Added summaries: 981, Updated longer summaries: 0, Total records: 981


# Merging all categories into a single JSON file

In [78]:
import json
from pathlib import Path
from copy import deepcopy

In [79]:
# Merge configuration: every (category, path) pair in SOURCES is read and its
# records are appended, in this order, to the single file at OUT_PATH.
ROOT = Path(".")
OUT_PATH = ROOT / "all_fields_merged.json"

# The politicians entry points at the output of the summary-merge step.
SOURCES = [
    ("art_music",   ROOT / "art_music_with_bios_imaged_crawled.json"),
    ("athlete",     ROOT / "athlete_merged_with_bios_imaged_crawled.json"),
    ("poets",       ROOT / "poets_merged_with_bios_imaged_crawled.json"),
    ("scientists",  ROOT / "scientists_merged_with_bios_imaged_crawled.json"),
    ("politicians", ROOT / "politicians_main_with_summaries.json"),
]

In [80]:
def load_records(path: Path):
    """Load ``path`` (UTF-8 JSON) and return its records as a list.

    A top-level list is returned as-is. For a top-level dict, the first of the
    container keys "records", "data", "items", "rows" that holds a list is
    returned; if none does, the dict's values are returned. Any other root type
    raises ValueError.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))

    if isinstance(payload, dict):
        for container_key in ("records", "data", "items", "rows"):
            inner = payload.get(container_key)
            if isinstance(inner, list):
                return inner
        return list(payload.values())

    if not isinstance(payload, list):
        raise ValueError(f"Unsupported JSON root type in {path}")
    return payload

In [81]:
# Concatenate the records of all sources into `merged`, tagging each copy with
# the category it came from and its position in the source file. `stats` keeps
# the record count per category (0 for a missing file).
merged = []
stats = {}

for category, json_path in SOURCES:
    if not json_path.exists():
        print(f"[WARN] missing file: {json_path}")
        stats[category] = 0
        continue

    recs = load_records(json_path)
    stats[category] = len(recs)

    for i, rec in enumerate(recs):
        if not isinstance(rec, dict):
            # wrap non-dict entries so we can still annotate
            rec = {"value": rec}
        # Work on a deep copy so the loaded source records are left untouched.
        r = deepcopy(rec)
        r["category"] = category
        r["field_idx"] = i            # 0-based index within its source file
        merged.append(r)

In [82]:
# Write the merged list as UTF-8 JSON (non-ASCII characters kept as-is) and
# report the per-category and total record counts.
OUT_PATH.write_text(json.dumps(merged, ensure_ascii=False, indent=2), encoding="utf-8")

print("Merged counts per category:", stats)
print(f"Total records: {sum(stats.values())}")
print(f"Wrote: {OUT_PATH}")

Merged counts per category: {'art_music': 1338, 'athlete': 3082, 'poets': 289, 'scientists': 241, 'politicians': 981}
Total records: 5931
Wrote: all_fields_merged.json
