# Merging same-category files

In [13]:
import json
from pathlib import Path
from typing import Any, Dict, List, Tuple

In [62]:
def normalize_name(name) -> str:
    """
    Normalize name to a comparable lowercase string.
    - If list: pick the first non-empty string.
    - If non-string: coerce to string.
    """
    if isinstance(name, list):
        for v in name:
            if isinstance(v, str) and v.strip():
                s = v
                break
        else:
            s = ""
    elif isinstance(name, str):
        s = name
    else:
        s = "" if name is None else str(name)
    return s.strip().casefold()

def is_empty(val: Any) -> bool:
    if val is None:
        return True
    if isinstance(val, str) and val.strip() == "":
        return True
    if isinstance(val, (list, tuple, set)) and len(val) == 0:
        return True
    if isinstance(val, dict) and len(val) == 0:
        return True
    return False

def as_image_dict(img: Any) -> Dict[str, Any]:
    """
    Normalize an image field to a dict:
    - dict -> dict
    - list/str/number -> {"_value": <that>}
    - None/empty -> {}
    """
    if isinstance(img, dict):
        return {k: v for k, v in img.items() if not is_empty(v)}
    if is_empty(img):
        return {}
    return {"_value": img}

def dedup_list(lst: List[Any]) -> List[Any]:
    seen = set()
    out = []
    for item in lst:
        key = json.dumps(item, sort_keys=True) if isinstance(item, (dict, list)) else item
        if key not in seen:
            seen.add(key)
            out.append(item)
    return out

def merge_values(v1: Any, v2: Any) -> Any:
    """Best-effort union for values under image keys."""
    if is_empty(v1):
        return v2
    if is_empty(v2):
        return v1

    if isinstance(v1, dict) and isinstance(v2, dict):
        return merge_dicts(v1, v2)

    if isinstance(v1, list) and isinstance(v2, list):
        return dedup_list(v1 + v2)

    if isinstance(v1, list) and not isinstance(v2, list):
        return dedup_list(v1 + [v2])

    if isinstance(v2, list) and not isinstance(v1, list):
        return dedup_list([v1] + v2)

    # scalars unequal -> keep both
    if v1 != v2:
        return dedup_list([v1, v2])

    return v1

def merge_dicts(d1: Dict[str, Any], d2: Dict[str, Any]) -> Dict[str, Any]:
    keys = set(d1) | set(d2)
    out = {}
    for k in keys:
        merged = merge_values(d1.get(k), d2.get(k))
        if not is_empty(merged):
            out[k] = merged
    return out

def merge_image(img1: Any, img2: Any) -> Dict[str, Any]:
    d1 = as_image_dict(img1)
    d2 = as_image_dict(img2)
    return merge_dicts(d1, d2)

def load_by_name(path: str) -> Tuple[Dict[str, dict], List[dict]]:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    index = {}
    for rec in data:
        if not isinstance(rec, dict):
            continue
        key = normalize_name(rec.get("name"))
        if key:
            index[key] = rec
    return index, data

def write_json(path: str, data: Any):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [15]:
def merge_pair(file_a: str, file_b: str, out_path: str):
    a_idx, a_list = load_by_name(file_a)
    b_idx, b_list = load_by_name(file_b)

    a_keys = set(a_idx)
    b_keys = set(b_idx)
    dup_keys = sorted(a_keys & b_keys)
    only_a  = sorted(a_keys - b_keys)
    only_b  = sorted(b_keys - a_keys)

    merged_records = []

    # Build quick lookups to preserve original 'name' strings
    def original_name(key):
        return (a_idx.get(key) or b_idx.get(key)).get("name")

    # duplicates → keep only name + merged image
    dup_report = []
    both_img = a_only_img = b_only_img = neither_img = 0

    for k in dup_keys:
        ra = a_idx[k]
        rb = b_idx[k]
        img_a = ra.get("image")
        img_b = rb.get("image")

        has_a = not is_empty(as_image_dict(img_a))
        has_b = not is_empty(as_image_dict(img_b))
        if has_a and has_b:
            both_img += 1
        elif has_a and not has_b:
            a_only_img += 1
        elif has_b and not has_a:
            b_only_img += 1
        else:
            neither_img += 1

        merged_img = merge_image(img_a, img_b)
        added_keys = sorted(set(as_image_dict(img_b)) - set(as_image_dict(img_a)))

        merged_records.append({
            "name": ra.get("name") or rb.get("name"),
            "image": merged_img
        })

        dup_report.append({
            "name": ra.get("name") or rb.get("name"),
            "from_file_a": file_a,
            "from_file_b": file_b,
            "image_a_keys": sorted(list(as_image_dict(img_a).keys())),
            "image_b_keys": sorted(list(as_image_dict(img_b).keys())),
            "added_keys_from_b": added_keys,
            "merged_image_keys": sorted(list(merged_img.keys()))
        })

    # uniques → keep record as-is
    for k in only_a:
        merged_records.append(a_idx[k])
    for k in only_b:
        merged_records.append(b_idx[k])

    # Sort final output deterministically by normalized name
    merged_records.sort(key=lambda r: normalize_name(r.get("name")))

    # write output
    write_json(out_path, merged_records)

    # report
    report = {
        "pair": [file_a, file_b],
        "input_counts": {
            Path(file_a).name: len(a_list),
            Path(file_b).name: len(b_list),
        },
        "name_key_stats": {
            "duplicates": len(dup_keys),
            "only_in_" + Path(file_a).name: len(only_a),
            "only_in_" + Path(file_b).name: len(only_b),
        },
        "duplicate_image_breakdown": {
            "both_have_image": both_img,
            "only_first_has_image": a_only_img,
            "only_second_has_image": b_only_img,
            "neither_has_image": neither_img,
        },
        "output_count": len(merged_records),
        "duplicates_detail": dup_report,  # full per-duplicate info
    }

    # pretty print report
    print("\n" + "="*80)
    print(f"Merged: {file_a}  +  {file_b}  →  {out_path}")
    print("-"*80)
    print("Inputs:")
    for k, v in report["input_counts"].items():
        print(f"  {k}: {v}")
    print("Name-key stats:")
    for k, v in report["name_key_stats"].items():
        print(f"  {k}: {v}")
    print("Duplicate image breakdown:")
    for k, v in report["duplicate_image_breakdown"].items():
        print(f"  {k}: {v}")
    print(f"Output records: {report['output_count']}")
    print("-"*80)
    print("Duplicates detail (name | added_keys_from_second | merged_keys):")
    for row in report["duplicates_detail"]:
        print(f"  - {row['name']} | +{row['added_keys_from_b']} | merged={row['merged_image_keys']}")
    print("="*80 + "\n")

    # Also write JSON report next to the merged file (optional but handy)
    report_path = Path(out_path).with_suffix(".report.json")
    write_json(str(report_path), report)

    return report

In [16]:
poets_report = merge_pair("poets20.json",      "poets22.json",      "poets_merged.json")
scis_report  = merge_pair("scientists15.json", "scientists21.json", "scientists_merged.json")


Merged: poets20.json  +  poets22.json  →  poets_merged.json
--------------------------------------------------------------------------------
Inputs:
  poets20.json: 284
  poets22.json: 26
Name-key stats:
  duplicates: 9
  only_in_poets20.json: 263
  only_in_poets22.json: 17
Duplicate image breakdown:
  both_have_image: 9
  only_first_has_image: 0
  only_second_has_image: 0
  neither_has_image: 0
Output records: 289
--------------------------------------------------------------------------------
Duplicates detail (name | added_keys_from_second | merged_keys):
  - احمد شاملو | +['adult'] | merged=['adult', 'tomb', 'young']
  - جامی | +['adult'] | merged=['adult', 'tomb']
  - خواجه عبدالله انصاری | +['adult'] | merged=['adult', 'young']
  - رهی معیری | +[] | merged=['adult', 'tomb']
  - عطار | +['adult'] | merged=['adult', 'tomb']
  - قطران تبریزی | +['adult'] | merged=['adult', 'tomb']
  - مولوی | +['adult'] | merged=['adult', 'tomb', 'young']
  - وصال شیرازی | +['adult'] | merged=['adu

# Merging narratives and add them to json as "summary"

In [17]:
import csv
import json
from pathlib import Path
from collections import defaultdict, Counter

In [23]:
POET_JSON_IN  = "poets_merged.json"
SCIS_JSON_IN  = "scientists_merged.json"
ART_MUSIC_JSON_IN = "art_music.json"
POET_JSON_OUT = "poets_merged_with_bios.json"
SCIS_JSON_OUT = "scientists_merged_with_bios.json"
ART_MUSIC_JSON_OUT = "art_music_with_bios.json"

POET_CSVS = ["poets_biography_1.csv", "poets_biography_2.csv"]
SCIS_CSVS = ["scientists_biography_1.csv", "scientists_biography_2.csv"]
ART_MUSIC_CSVS = ["art_music_biography.csv"]

In [24]:
def normalize_name(name: str) -> str:
    """Use exactly the same normalization you used for JSON merges."""
    return (name or "").strip().casefold()

def extract_name_from_bio(text: str):
    """
    Given a biography string like '{name} ، ...', return (name, biography_text).
    If '،' not found, returns (None, text) so caller can ignore that row.
    """
    if not isinstance(text, str):
        return None, text
    if "،" not in text:
        return None, text
    name = text.split("،", 1)[0].strip()
    return (name if name else None), text.strip()

def iter_bio_texts_from_csv(csv_path: str):
    """
    Read a CSV and yield biography strings. We:
      - prefer any cell that contains '،'
      - otherwise join non-empty cells (but then we require '،' in the final string)
    Rows without '،' are ignored.
    """
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        rdr = csv.reader(f)
        for row in rdr:
            if not row:
                continue
            # Prefer the cell(s) that contain the Persian comma
            candidates = [c for c in row if isinstance(c, str) and "،" in c]
            if candidates:
                text = max(candidates, key=len).strip()
            else:
                text = " ".join([c for c in row if isinstance(c, str) and c.strip()]).strip()
            if "،" not in text or not text:
                continue
            yield text

def load_bios_grouped_by_name(csv_paths):
    """
    Build dict: normalized_name -> list of biography strings (order preserved).
    If same name appears multiple times across files, we keep all (first-file-first).
    """
    grouped = defaultdict(list)
    order_counter = Counter()
    for p in csv_paths:
        for text in iter_bio_texts_from_csv(p):
            name, bio = extract_name_from_bio(text)
            if not name:
                continue
            key = normalize_name(name)
            grouped[key].append(bio)
            order_counter[key] += 1
    return grouped, order_counter  # second value is how many bios per name (for diagnostics)

def index_json_by_name(json_path: str):
    """Load JSON array and return (records_list, index_by_normalized_name)."""
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    index = {}
    for rec in data:
        nm = rec.get("name")
        if isinstance(nm, str) and nm.strip():
            index[normalize_name(nm)] = rec
    return data, index

def attach_bios(records, index_by_name, bios_by_name):
    """
    Attach biographies to the next available summary field per record:
    summary -> summary2 -> summary3. Stop after summary3.
    Returns a report dict with counts and unfound names.
    """
    added_summary = added_summary2 = added_summary3 = 0
    unfound = []
    skipped_extra = 0  # bios that we couldn't add because summary3 already existed

    for key, bio_list in bios_by_name.items():
        rec = index_by_name.get(key)
        if rec is None:
            unfound.append(key)
            continue

        for bio in bio_list:
            if "summary" not in rec or rec.get("summary") in (None, ""):
                rec["summary"] = bio
                added_summary += 1
            elif "summary2" not in rec or rec.get("summary2") in (None, ""):
                rec["summary2"] = bio
                added_summary2 += 1
            elif "summary3" not in rec or rec.get("summary3") in (None, ""):
                rec["summary3"] = bio
                added_summary3 += 1
            else:
                skipped_extra += 1
                # Do not add beyond summary3

    return {
        "added_summary": added_summary,
        "added_summary2": added_summary2,
        "added_summary3": added_summary3,
        "skipped_extra_bios": skipped_extra,
        "unfound_count": len(unfound),
        "unfound_names_sample": unfound[:20],  # preview a few
    }

def write_json(path: str, data):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def run_pipeline(kind: str, json_in: str, json_out: str, csvs: list):
    print("\n" + "="*88)
    print(f"{kind.upper()} — Merging narratives into JSON")
    print("="*88)

    # 1) Load and group biographies
    bios_by_name, dup_counter = load_bios_grouped_by_name(csvs)
    total_bio_rows = sum(dup_counter.values())
    multi_bio_names = {k: c for k, c in dup_counter.items() if c > 1}

    print(f"CSV inputs: {', '.join(csvs)}")
    print(f"Biographies parsed (rows with '،'): {total_bio_rows}")
    print(f"Unique names in CSVs: {len(bios_by_name)}")
    if multi_bio_names:
        print(f"Names with multiple bios: {len(multi_bio_names)} (first few: "
              f"{', '.join(list(multi_bio_names.keys())[:10])})")

    # 2) Load JSON + index by normalized name
    records, index_by_name = index_json_by_name(json_in)
    print(f"JSON input: {json_in} | records: {len(records)} | indexable by name: {len(index_by_name)}")

    # 3) Attach bios to summary fields
    rep = attach_bios(records, index_by_name, bios_by_name)

    # 4) Save output
    write_json(json_out, records)

    # 5) Report
    print("-"*88)
    print(f"Added summary fields:")
    print(f"  summary : {rep['added_summary']}")
    print(f"  summary2: {rep['added_summary2']}")
    print(f"  summary3: {rep['added_summary3']}")
    print(f"Skipped extra bios (already had summary3): {rep['skipped_extra_bios']}")
    print(f"Unfound narrative records: {rep['unfound_count']}")
    if rep["unfound_names_sample"]:
        print(f"Examples (normalized): {rep['unfound_names_sample']}")
    print(f"Output written to: {json_out}")
    print("="*88 + "\n")

In [25]:
run_pipeline("poets",      POET_JSON_IN, POET_JSON_OUT, POET_CSVS)
run_pipeline("scientists", SCIS_JSON_IN, SCIS_JSON_OUT, SCIS_CSVS)
run_pipeline("art_music",  ART_MUSIC_JSON_IN, ART_MUSIC_JSON_OUT, ART_MUSIC_CSVS)   


POETS — Merging narratives into JSON
CSV inputs: poets_biography_1.csv, poets_biography_2.csv
Biographies parsed (rows with '،'): 733
Unique names in CSVs: 645
Names with multiple bios: 83 (first few: شاه نعمت‌الله ولی, مهدیه الهی قمشه‌ای, رودکی, شکوه قاسم‌نیا, عمعق بخاری, قاسم انوار, بابا فغانی شیرازی, ایرج میرزا, پروین اعتصامی, محمدتقی بهار)
JSON input: poets_merged.json | records: 289 | indexable by name: 289
----------------------------------------------------------------------------------------
Added summary fields:
  summary : 209
  summary2: 56
  summary3: 4
Skipped extra bios (already had summary3): 1
Unfound narrative records: 436
Examples (normalized): ['افضل\u200cالدین بدیل بن علی خاقانی شروانی', 'رکن\u200cالدین اوحدی مراغه\u200cای', 'محمد بن حسام\u200cالدین حسن بن شمس\u200cالدین محمد خوسفی', 'نورالدین عبدالرحمن ابن نظام\u200cالدین احمد ابن محمد', 'بهاءالدین محمد عاملی', 'ابوالحسن فراهانی در شهر فراهان متولد شد و در سال ۱۰۴۰ خورشیدی درگذشت. او در قرن یازدهم هجری به عنوان شا

In [26]:
FILES = [
    "poets_merged_with_bios.json",
    "scientists_merged_with_bios.json",
    "art_music_with_bios.json"
]

def filled(x):
    """True if a value is a non-empty, non-whitespace string."""
    return isinstance(x, str) and x.strip() != ""

def summarize_file(path: Path):
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    c_summary = c_summary2 = c_summary3 = c_without_any = 0

    for rec in data:
        s1 = filled(rec.get("summary"))
        s2 = filled(rec.get("summary2"))
        s3 = filled(rec.get("summary3"))

        if s1: c_summary += 1
        if s2: c_summary2 += 1
        if s3: c_summary3 += 1
        if not (s1 or s2 or s3):
            c_without_any += 1

    return {
        "file": path.name,
        "total": len(data),
        "summary": c_summary,
        "summary2": c_summary2,
        "summary3": c_summary3,
        "without_any_summary": c_without_any,
    }

reports = []
for fname in FILES:
    p = Path(fname)
    if p.exists():
        rep = summarize_file(p)
        reports.append(rep)
    else:
        print(f"⚠️  Skipping (not found): {fname}")

# Print per-file and totals
totals = {"total": 0, "summary": 0, "summary2": 0, "summary3": 0, "without_any_summary": 0}

for r in reports:
    print("\n" + "="*70)
    print(f"File: {r['file']}")
    print("-"*70)
    print(f"Total records:           {r['total']}")
    print(f"With summary:            {r['summary']}")
    print(f"With summary2:           {r['summary2']}")
    print(f"With summary3:           {r['summary3']}")
    print(f"Without any summary*:    {r['without_any_summary']}")
    print("="*70)

    for k in totals:
        totals[k] += r[k] if k in r else 0

if reports:
    print("\n" + "#"*70)
    print("AGGREGATE TOTALS (all processed files)")
    print("#"*70)
    print(f"Total records:           {totals['total']}")
    print(f"With summary:            {totals['summary']}")
    print(f"With summary2:           {totals['summary2']}")
    print(f"With summary3:           {totals['summary3']}")
    print(f"Without any summary*:    {totals['without_any_summary']}")
    print("#"*70)
else:
    print("No files processed.")


File: poets_merged_with_bios.json
----------------------------------------------------------------------
Total records:           289
With summary:            209
With summary2:           56
With summary3:           4
Without any summary*:    80

File: scientists_merged_with_bios.json
----------------------------------------------------------------------
Total records:           241
With summary:            132
With summary2:           26
With summary3:           4
Without any summary*:    109

File: art_music_with_bios.json
----------------------------------------------------------------------
Total records:           1338
With summary:            470
With summary2:           178
With summary3:           149
Without any summary*:    868

######################################################################
AGGREGATE TOTALS (all processed files)
######################################################################
Total records:           1868
With summary:            811
With summa

# Try to download images

In [27]:
import os
import re
import json
import time
import random
import urllib.request
import urllib.parse
from io import BytesIO
from PIL import Image
from pathlib import Path

In [28]:
INPUTS = [
    {
        "json_in":  "scientists_merged_with_bios.json",
        "json_out": "scientists_merged_with_bios_imaged.json",
        "img_dir":  "scientists_images",
    },
    {
        "json_in":  "poets_merged_with_bios.json",
        "json_out": "poets_merged_with_bios_imaged.json",
        "img_dir":  "poets_images",
    },
]

In [29]:
FIELDS_WHITELIST = None  # or set to a set of keys

# Polite UA
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}

In [34]:
URL_RE = re.compile(r"^https?://", re.IGNORECASE)
GENERIC_URL_KEYS = {"url", "href", "src", "link"}
IMG_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".tif", ".tiff", ".bmp", ".svg"}

def sanitize_filename(s: str) -> str:
    s = (str(s) or "").strip()
    s = re.sub(r"\s+", "_", s)
    s = re.sub(r"[^\w\-.]+", "", s)
    return s or "image"

def http_open(url, timeout=35):
    req = urllib.request.Request(url, headers=HEADERS)
    return urllib.request.urlopen(req, timeout=timeout)

def is_direct_image_url(url: str) -> bool:
    u = urllib.parse.urlparse(url)
    ext = os.path.splitext(u.path)[1].lower()
    return (ext in IMG_EXTS) or ("upload.wikimedia.org" in u.netloc.lower())

def from_wikimedia_thumb_to_original(url: str) -> str:
    parts = urllib.parse.urlparse(url)
    path = parts.path
    if "/thumb/" in path:
        base, tail = path.split("/thumb/", 1)
        segs = tail.split("/")
        if len(segs) >= 2:
            orig_path = base + "/" + "/".join(segs[:-1])
            return urllib.parse.urlunparse(parts._replace(path=orig_path))
    return url

def read_text(url, timeout=35, encoding="utf-8"):
    with http_open(url, timeout) as resp:
        data = resp.read()
    try:
        return data.decode(encoding, errors="ignore")
    except Exception:
        return data.decode("utf-8", errors="ignore")

def wiki_api_request(base, params):
    qs = urllib.parse.urlencode(params)
    url = f"{base}?{qs}"
    try:
        txt = read_text(url)
        return json.loads(txt)
    except Exception:
        return {}

def parse_wiki_title(u):
    parsed = urllib.parse.urlparse(u)
    if "/wiki/" not in parsed.path:
        return parsed.netloc, None
    title = urllib.parse.unquote(parsed.path.split("/wiki/", 1)[1])
    return parsed.netloc, title

def to_canonical_file_title(title):
    if not title or ":" not in title:
        return None
    name = title.split(":", 1)[1]
    return "File:" + name

def resolve_wikipedia_article_image(article_url: str) -> str or None:
    host, title = parse_wiki_title(article_url)
    if not title:
        return None
    api_base = f"https://{host}/w/api.php"
    params = {
        "action": "query",
        "titles": title,
        "prop": "pageimages",
        "piprop": "original",
        "format": "json",
    }
    data = wiki_api_request(api_base, params)
    pages = (data.get("query") or {}).get("pages") or {}
    for _, pg in pages.items():
        orig = pg.get("original")
        if orig and orig.get("source"):
            return orig["source"]
    return None

# Common localized "File:" prefixes (handles fa: پرونده / فایل)
FILE_NS_PREFIXES = {"file", "پرونده", "فایل", "fichier", "datei", "archivo", "arquivo", "dosya", "файл", "ファイル", "文件", "檔案", "파일"}

def resolve_wiki_file_image(file_page_url: str) -> str or None:
    host, title = parse_wiki_title(file_page_url)
    if not title:
        return None
    prefix = title.split(":", 1)[0].lower()
    looks_file_ns = (":" in title) and (prefix in FILE_NS_PREFIXES or "." in title)
    if looks_file_ns:
        title = to_canonical_file_title(title) or title

    api_base = f"https://{host}/w/api.php"
    params = {
        "action": "query",
        "titles": title,
        "prop": "imageinfo",
        "iiprop": "url|mime",
        "format": "json",
    }
    data = wiki_api_request(api_base, params)
    pages = (data.get("query") or {}).get("pages") or {}
    for _, pg in pages.items():
        infos = pg.get("imageinfo") or []
        if infos and "url" in infos[0]:
            return infos[0]["url"]
    # Fallback via Special:FilePath (will redirect to the file)
    base_name = title.split(":", 1)[1] if ":" in title else title
    for host_try in (host, "commons.wikimedia.org"):
        c = f"https://{host_try}/wiki/Special:FilePath/{urllib.parse.quote(base_name)}"
        try:
            with http_open(c) as resp:
                ctype = (resp.headers.get("Content-Type") or "").lower()
                if ctype.startswith("image/"):
                    return c
        except Exception:
            pass
    return None

def resolve_og_image(page_url: str) -> str or None:
    try:
        html = read_text(page_url)
    except Exception:
        return None
    m = re.search(r'<meta\s+property=["\']og:image["\']\s+content=["\']([^"\']+)["\']', html, re.IGNORECASE)
    if m:
        img = m.group(1)
        if "upload.wikimedia.org" in img and "/thumb/" in img:
            img = from_wikimedia_thumb_to_original(img)
        return img
    return None

def basename_from_url(url: str) -> str:
    p = urllib.parse.urlparse(url)
    return os.path.basename(p.path)

def svg_to_png_thumbnail(file_name: str, prefer_host: str = "commons.wikimedia.org", width: int = 1200) -> str:
    quoted = urllib.parse.quote(file_name)
    return f"https://{prefer_host}/wiki/Special:FilePath/{quoted}?width={width}"

def resolve_to_direct_images(url: str) -> list:
    """
    Return a list of direct image URLs (best-effort).
    We will try them in order until one downloads successfully.
    """
    out = []
    if is_direct_image_url(url):
        ubase = basename_from_url(url).lower()
        if ubase.endswith(".svg"):
            out.append(svg_to_png_thumbnail(ubase))
        else:
            out.append(from_wikimedia_thumb_to_original(url))
        return out

    u = urllib.parse.urlparse(url)
    host = u.netloc.lower()
    path = u.path
    is_wikipedia = host.endswith(".wikipedia.org")
    is_commons   = (host == "commons.wikimedia.org")

    if (is_wikipedia or is_commons) and "/wiki/" in path:
        f = resolve_wiki_file_image(url)
        if f:
            if f.lower().endswith(".svg"):
                out.append(svg_to_png_thumbnail(basename_from_url(f)))
            else:
                out.append(f)
            return out
        a = resolve_wikipedia_article_image(url)
        if a:
            if a.lower().endswith(".svg"):
                out.append(svg_to_png_thumbnail(basename_from_url(a)))
            else:
                out.append(a)
            return out

    og = resolve_og_image(url)
    if og:
        if og.lower().endswith(".svg"):
            out.append(svg_to_png_thumbnail(basename_from_url(og)))
        else:
            out.append(og)
    return out

def download_image(url, image_path):
    try:
        with http_open(url, timeout=35) as resp:
            final_url = resp.geturl()
            ctype = (resp.headers.get("Content-Type") or "").lower()
            if not ctype.startswith("image/"):
                # try following redirect target explicitly once
                with http_open(final_url, timeout=35) as resp2:
                    ctype2 = (resp2.headers.get("Content-Type") or "").lower()
                    if not ctype2.startswith("image/"):
                        return False
                    data = resp2.read()
            else:
                data = resp.read()

        img = Image.open(BytesIO(data))
        if img.mode not in ("RGB", "L"):
            img = img.convert("RGB")
        elif img.mode == "L":
            img = img.convert("RGB")
        img.save(image_path, format="JPEG", quality=92)
        return True
    except Exception:
        return False

def iter_urls(obj):
    """Yield all URL-like strings found recursively in obj."""
    if obj is None:
        return
    if isinstance(obj, str):
        s = obj.strip()
        if URL_RE.match(s):
            yield s
        return
    if isinstance(obj, dict):
        for v in obj.values():
            yield from iter_urls(v)
    elif isinstance(obj, (list, tuple, set)):
        for v in obj:
            yield from iter_urls(v)

In [35]:
for pack in INPUTS:
    json_in  = Path(pack["json_in"])
    json_out = Path(pack["json_out"])
    img_dir  = Path(pack["img_dir"])
    if not json_in.exists():
        print(f"⚠️  Skipping (missing): {json_in}")
        continue

    img_dir.mkdir(parents=True, exist_ok=True)

    with json_in.open("r", encoding="utf-8") as f:
        records = json.load(f)

    downloaded_true = 0
    downloaded_false = 0

    print("\n" + "="*100)
    print(f"PROCESSING FILE: {json_in.name}")
    print("="*100)

    for idx, rec in enumerate(records):
        name = rec.get("name") or f"record_{idx}"
        name_safe = sanitize_filename(name)
        image_obj = rec.get("image")

        print(f"\n[{idx}] {name}")
        record_ok = False
        any_urls_found = False

        if isinstance(image_obj, dict) and image_obj:
            image_fields = list(image_obj.keys())
            if FIELDS_WHITELIST:
                image_fields = [k for k in image_fields if k in FIELDS_WHITELIST]

            if not image_fields:
                print("  ⚠️ image object present but no fields to check (whitelist filtered all).")

            for field in image_fields:
                field_val = image_obj.get(field)
                if field_val is None or (isinstance(field_val, (str, list, dict)) and not field_val):
                    print(f"  • {field}: ⚠️ field missing/empty")
                    continue

                urls = list(dict.fromkeys(iter_urls(field_val)))  # dedup in-order
                if not urls:
                    print(f"  • {field}: ⚠️ no URL found under this field")
                    continue

                any_urls_found = True
                print(f"  • {field}: ℹ️ found {len(urls)} URL(s) → resolving…")

                saved_this_field = False
                for url in urls:
                    candidates = resolve_to_direct_images(url)
                    if not candidates:
                        print(f"      - {url} → ℹ️ no direct image resolvable")
                        continue

                    print(f"      - {url} → ℹ️ resolved {len(candidates)} candidate(s)")
                    for cand in candidates:
                        filename = f"{idx}_{name_safe}_{sanitize_filename(field)}.jpg"
                        out_path = img_dir / filename
                        ok = download_image(cand, str(out_path))
                        if ok:
                            print(f"         ✅ download OK → {out_path.name}")
                            saved_this_field = True
                            record_ok = True
                            break
                        else:
                            print(f"         ❌ download FAILED for candidate")

                    if saved_this_field:
                        break

                if not saved_this_field:
                    print(f"  • {field}: ❌ URL extracted but download missed for all candidates")

                # polite delay between fields
                time.sleep(random.uniform(0.05, 0.2))

        else:
            print("  ⚠️ image object missing or empty")

        # per-record summary + flag
        rec["is_image_downloaded"] = bool(record_ok)
        if record_ok:
            print("  ▶︎ SUMMARY: ✅ Download COMPLETE for this record")
            downloaded_true += 1
        else:
            if any_urls_found:
                print("  ▶︎ SUMMARY: ❌ URL extracted but ALL downloads failed")
            else:
                print("  ▶︎ SUMMARY: ❌ URL missed (no usable URLs under any image fields)")
            downloaded_false += 1

        # small delay between records
        time.sleep(random.uniform(0.05, 0.25))

    # write updated JSON
    with json_out.open("w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

    total = len(records)
    print("\n" + "-"*100)
    print(f"FILE DONE: {json_in.name}")
    print(f"Images dir: {img_dir}")
    print(f"Output JSON: {json_out.name}")
    print(f"Total records:              {total}")
    print(f"is_image_downloaded = True: {downloaded_true}")
    print(f"is_image_downloaded = False:{downloaded_false}")
    print("-"*100)


PROCESSING FILE: scientists_merged_with_bios.json

[0] Abbas Shafiee
  • young: ℹ️ found 1 URL(s) → resolving…
      - https://www.researchgate.net/profile/Abbas-Shafiee-3 → ℹ️ no direct image resolvable
  • young: ❌ URL extracted but download missed for all candidates
  • adult: ℹ️ found 1 URL(s) → resolving…
      - https://www.researchgate.net/profile/Abbas-Shafiee-3 → ℹ️ no direct image resolvable
  • adult: ❌ URL extracted but download missed for all candidates
  • tomb: ℹ️ found 1 URL(s) → resolving…
      - https://www.researchgate.net/profile/Abbas-Shafiee-3 → ℹ️ no direct image resolvable
  • tomb: ❌ URL extracted but download missed for all candidates
  ▶︎ SUMMARY: ❌ URL extracted but ALL downloads failed

[1] Abdullah Ibn al-Muqaffa'
  • young: ⚠️ field missing/empty
  • adult: ⚠️ field missing/empty
  • tomb: ⚠️ field missing/empty
  ▶︎ SUMMARY: ❌ URL missed (no usable URLs under any image fields)

[2] Abu Mansur al-Maturidi
  • young: ⚠️ field missing/empty
  • adult: ⚠️ 

In [36]:
ART_IN   = Path("art_music_with_bios.json")
NOURL_IN = Path("no_url_records_art_and_music.json")
ART_OUT  = Path("art_music_with_bios_imaged.json")

def normalize_name(s: str) -> str:
    return (s or "").strip().casefold()

In [37]:
with ART_IN.open("r", encoding="utf-8") as f:
    art_records = json.load(f)

with NOURL_IN.open("r", encoding="utf-8") as f:
    # expected like: [{"10": "name1"}, {"15": "name2"}, ...]
    no_url_rows = json.load(f)

In [38]:
no_url_names = []
no_url_idx_by_name = defaultdict(list)

for row in no_url_rows:
    if isinstance(row, dict):
        for k, v in row.items():
            if isinstance(v, str) and v.strip():
                nm = v.strip()
                key = normalize_name(nm)
                no_url_names.append(key)
                no_url_idx_by_name[key].append(k)
    elif isinstance(row, str) and row.strip():  # just in case any row is directly a name
        key = normalize_name(row)
        no_url_names.append(key)
        no_url_idx_by_name[key].append(None)

no_url_set = set(no_url_names)
no_url_dups = {n for n in no_url_names if no_url_names.count(n) > 1}

In [39]:
index_by_name = {}
for rec in art_records:
    nm = rec.get("name")
    if isinstance(nm, str) and nm.strip():
        index_by_name[normalize_name(nm)] = rec

In [40]:
matched_false = 0
set_true = 0
unfound_in_art = []

for key in no_url_set:
    rec = index_by_name.get(key)
    if rec is None:
        unfound_in_art.append(key)
    else:
        rec["is_image_downloaded"] = False
        matched_false += 1

# All others -> True
for key, rec in index_by_name.items():
    if key not in no_url_set:
        rec["is_image_downloaded"] = True
        set_true += 1

# Note: if any record lacks a usable name (cannot be normalized), set True by instruction “all other records”
missing_name_true = 0
for rec in art_records:
    nm = rec.get("name")
    if not (isinstance(nm, str) and nm.strip()):
        rec["is_image_downloaded"] = True
        missing_name_true += 1

# ---- write output ----
with ART_OUT.open("w", encoding="utf-8") as f:
    json.dump(art_records, f, ensure_ascii=False, indent=2)

# ---- report ----
total = len(art_records)
print("\n" + "="*90)
print("ART & MUSIC — is_image_downloaded assignment report")
print("="*90)
print(f"Input JSON:             {ART_IN.name}")
print(f"No-URL JSON:            {NOURL_IN.name}")
print("-"*90)
print(f"Total art/music records:        {total}")
print(f"Names in no-URL list:           {len(no_url_set)} (raw rows: {len(no_url_rows)})")
print(f"Duplicate names in no-URL list: {len(no_url_dups)}")
print("-"*90)
print(f"Set is_image_downloaded = False (matched): {matched_false}")
print(f"Set is_image_downloaded = True  (others):  {set_true}")
print(f"Records with missing/blank name set to True: {missing_name_true}")
print(f"Unfound names from no-URL list: {len(unfound_in_art)}")
if unfound_in_art:
    sample = unfound_in_art[:20]
    print(f"  e.g. (normalized): {sample}")
print("-"*90)
print(f"Output written to: {ART_OUT.name}")
print("="*90)


ART & MUSIC — is_image_downloaded assignment report
Input JSON:             art_music_with_bios.json
No-URL JSON:            no_url_records_art_and_music.json
------------------------------------------------------------------------------------------
Total art/music records:        1338
Names in no-URL list:           54 (raw rows: 56)
Duplicate names in no-URL list: 2
------------------------------------------------------------------------------------------
Set is_image_downloaded = False (matched): 54
Set is_image_downloaded = True  (others):  1214
Records with missing/blank name set to True: 0
Unfound names from no-URL list: 0
------------------------------------------------------------------------------------------
Output written to: art_music_with_bios_imaged.json


# Crawling for False "is_image_downloaded" records' images

In [56]:
import os
import re
import json
import time
import random
import urllib.request
import urllib.parse
from io import BytesIO
from pathlib import Path
from PIL import Image

In [57]:
INPUTS = [
    {
        "json_in":  "scientists_merged_with_bios_imaged.json",
        "json_out": "scientists_merged_with_bios_imaged_crawled.json",
        "img_dir":  "scientists_images",
    },
    {
        "json_in":  "poets_merged_with_bios_imaged.json",
        "json_out": "poets_merged_with_bios_imaged_crawled.json",
        "img_dir":  "poets_images",
    },
    {
        "json_in":  "art_music_with_bios_imaged.json",
        "json_out": "art_music_with_bios_imaged_crawled.json",
        "img_dir":  "art_music_images",
    },
]

In [58]:
MAX_IMAGE_CANDIDATES = 10

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}

In [59]:
def sanitize_filename(s: str) -> str:
    s = (str(s) or "").strip()
    s = re.sub(r"\s+", "_", s)
    s = re.sub(r"[^\w\-.]+", "", s)
    return s or "image"

def http_open(url, timeout=35):
    req = urllib.request.Request(url, headers=HEADERS)
    return urllib.request.urlopen(req, timeout=timeout)

def download_image_to_jpeg(url, image_path) -> bool:
    try:
        with http_open(url, timeout=35) as resp:
            final_url = resp.geturl()
            ctype = (resp.headers.get("Content-Type") or "").lower()
            if not ctype.startswith("image/"):
                with http_open(final_url, timeout=35) as resp2:
                    ctype2 = (resp2.headers.get("Content-Type") or "").lower()
                    if not ctype2.startswith("image/"):
                        return False
                    data = resp2.read()
            else:
                data = resp.read()

        img = Image.open(BytesIO(data))
        if img.mode not in ("RGB", "L"):
            img = img.convert("RGB")
        elif img.mode == "L":
            img = img.convert("RGB")
        img.save(image_path, format="JPEG", quality=92)
        return True
    except Exception:
        return False

# ---------- Google providers (optional; use if keys set) ----------

def google_images_serpapi(query: str, num: int) -> list:
    api_key = os.getenv("SERPAPI_API_KEY")
    if not api_key:
        return []
    params = {"engine": "google_images", "q": query, "api_key": api_key, "ijn": "0", "safe": "active"}
    url = "https://serpapi.com/search.json?" + urllib.parse.urlencode(params)
    try:
        with http_open(url, timeout=35) as resp:
            data = json.loads(resp.read().decode("utf-8", errors="ignore"))
        results = data.get("images_results") or []
        out, seen = [], set()
        for r in results:
            u = r.get("original") or r.get("thumbnail")
            if u and u.startswith("http") and u not in seen:
                seen.add(u)
                out.append(u)
            if len(out) >= num:
                break
        return out
    except Exception:
        return []

def google_images_cse(query: str, num: int) -> list:
    api_key = os.getenv("GOOGLE_API_KEY")
    cse_id  = os.getenv("GOOGLE_CSE_ID")
    if not api_key or not cse_id:
        return []
    params = {"q": query, "key": api_key, "cx": cse_id, "searchType": "image", "num": min(num, 10), "safe": "active"}
    url = "https://www.googleapis.com/customsearch/v1?" + urllib.parse.urlencode(params)
    try:
        with http_open(url, timeout=35) as resp:
            data = json.loads(resp.read().decode("utf-8", errors="ignore"))
        items = data.get("items") or []
        out, seen = [], set()
        for it in items:
            link = it.get("link")
            if link and link.startswith("http") and link not in seen:
                seen.add(link)
                out.append(link)
        return out[:num]
    except Exception:
        return []

# ---------- Wikimedia fallback (no keys needed) ----------

IMG_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".tif", ".tiff", ".bmp", ".svg"}

def read_json_url(url, timeout=35):
    with http_open(url, timeout=timeout) as resp:
        return json.loads(resp.read().decode("utf-8", errors="ignore"))

def basename_from_url(url: str) -> str:
    p = urllib.parse.urlparse(url)
    return os.path.basename(p.path)

def svg_to_png_thumbnail(file_name: str, prefer_host: str = "commons.wikimedia.org", width: int = 1200) -> str:
    quoted = urllib.parse.quote(file_name)
    return f"https://{prefer_host}/wiki/Special:FilePath/{quoted}?width={width}"

def wikipedia_search_first_title(host: str, name: str) -> str | None:
    # try exact title first
    exact = f"https://{host}/w/api.php?action=query&titles={urllib.parse.quote(name)}&format=json"
    try:
        data = read_json_url(exact)
        pages = (data.get("query") or {}).get("pages") or {}
        if pages and list(pages.keys())[0] != "-1":
            # found exact
            return list(pages.values())[0].get("title")
    except Exception:
        pass
    # fallback: search
    url = f"https://{host}/w/api.php?action=query&list=search&srsearch={urllib.parse.quote(name)}&srlimit=1&format=json"
    try:
        data = read_json_url(url)
        hits = (data.get("query") or {}).get("search") or []
        if hits:
            return hits[0].get("title")
    except Exception:
        pass
    return None

def wikipedia_lead_image(host: str, title: str) -> str | None:
    url = f"https://{host}/w/api.php?action=query&titles={urllib.parse.quote(title)}&prop=pageimages&piprop=original&format=json"
    try:
        data = read_json_url(url)
        pages = (data.get("query") or {}).get("pages") or {}
        for _, pg in pages.items():
            orig = pg.get("original")
            if orig and orig.get("source"):
                return orig["source"]
    except Exception:
        pass
    return None

def commons_file_urls_from_search(name: str, limit: int = 5) -> list:
    # namespace 6 = File:
    url = f"https://commons.wikimedia.org/w/api.php?action=query&list=search&srnamespace=6&srsearch={urllib.parse.quote(name)}&srlimit={limit}&format=json"
    out = []
    try:
        data = read_json_url(url)
        hits = (data.get("query") or {}).get("search") or []
        titles = [h.get("title") for h in hits if h.get("title")]
        if not titles:
            return []
        # get imageinfo for titles
        titles_q = "|".join(titles)
        info = f"https://commons.wikimedia.org/w/api.php?action=query&titles={urllib.parse.quote(titles_q)}&prop=imageinfo&iiprop=url&format=json"
        data2 = read_json_url(info)
        pages = (data2.get("query") or {}).get("pages") or {}
        for _, pg in pages.items():
            ii = pg.get("imageinfo") or []
            if ii and ii[0].get("url"):
                out.append(ii[0]["url"])
    except Exception:
        return []
    return out

def wikimedia_image_candidates(name: str, num: int) -> list:
    """
    Try fa.wikipedia → en.wikipedia lead image → Commons file search.
    Convert SVGs to PNG thumbnails.
    """
    urls = []

    for host in ("fa.wikipedia.org", "en.wikipedia.org"):
        title = wikipedia_search_first_title(host, name)
        if title:
            img = wikipedia_lead_image(host, title)
            if img:
                if img.lower().endswith(".svg"):
                    urls.append(svg_to_png_thumbnail(basename_from_url(img)))
                else:
                    urls.append(img)
                if len(urls) >= num:
                    break

    if len(urls) < num:
        commons = commons_file_urls_from_search(name, limit=max(3, num))
        for u in commons:
            if u.lower().endswith(".svg"):
                urls.append(svg_to_png_thumbnail(basename_from_url(u)))
            else:
                urls.append(u)
            if len(urls) >= num:
                break

    # unique in order
    seen, uniq = set(), []
    for u in urls:
        if u not in seen:
            seen.add(u)
            uniq.append(u)
    return uniq[:num]

# Unified selector
def image_candidates_by_provider(query: str, num: int, provider: str) -> list:
    if provider == "SerpAPI":
        c = google_images_serpapi(query, num)
        if c: return c
        # fall through to CSE then Wikimedia
        provider = "CSE"
    if provider == "CSE":
        c = google_images_cse(query, num)
        if c: return c
        provider = "Wikimedia"
    # Wikimedia works with no keys
    return wikimedia_image_candidates(query, num)

In [60]:
for pack in INPUTS:
    json_in  = Path(pack["json_in"])
    json_out = Path(pack["json_out"])
    img_dir  = Path(pack["img_dir"])

    if not json_in.exists():
        print(f"⚠️  Skipping (missing): {json_in}")
        continue

    img_dir.mkdir(parents=True, exist_ok=True)

    with json_in.open("r", encoding="utf-8") as f:
        records = json.load(f)

    total = len(records)
    need_crawl = [i for i, r in enumerate(records) if not r.get("is_image_downloaded", False)]

    # Decide starting provider label for logs
    start_provider = "SerpAPI" if os.getenv("SERPAPI_API_KEY") else ("CSE" if os.getenv("GOOGLE_API_KEY") and os.getenv("GOOGLE_CSE_ID") else "Wikimedia")

    print("\n" + "="*110)
    print(f"CRAWLING FILE: {json_in.name}  |  Provider: {start_provider} (fallbacks enabled)")
    print("="*110)
    print(f"Total records: {total} | Need crawl: {len(need_crawl)}")

    success = 0
    failed  = 0

    for idx in need_crawl:
        rec  = records[idx]
        name = rec.get("name") or f"record_{idx}"
        name_safe = sanitize_filename(name)
        print(f"\n[{idx}] {name}  → searching images…")

        candidates = image_candidates_by_provider(name, MAX_IMAGE_CANDIDATES, start_provider)
        if not candidates:
            print("  ⚠️ No candidates found by any provider.")
            rec["is_image_downloaded"] = False
            failed += 1
            time.sleep(random.uniform(0.2, 0.5))
            continue

        print(f"  ℹ️ {len(candidates)} candidate(s) found")
        saved = False
        out_path = img_dir / f"{idx}_{name_safe}_crawled.jpg"

        for ci, url in enumerate(candidates, start=1):
            ok = download_image_to_jpeg(url, str(out_path))
            if ok:
                if not isinstance(rec.get("image"), dict):
                    rec["image"] = {}
                rec["image"]["crawled"] = url
                rec["is_image_downloaded"] = True
                print(f"  ✅ Candidate {ci}/{len(candidates)} OK → {out_path.name}")
                saved = True
                success += 1
                break
            else:
                print(f"  ❌ Candidate {ci}/{len(candidates)} FAILED")

            time.sleep(random.uniform(0.15, 0.35))

        if not saved:
            rec["is_image_downloaded"] = False
            print("  ▶︎ RESULT: ❌ All candidates failed")
            failed += 1

        time.sleep(random.uniform(0.25, 0.6))

    # Write output JSON
    with json_out.open("w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

    # Final per-file report
    print("\n" + "-"*110)
    print(f"FILE DONE: {json_in.name}")
    print(f"Output JSON: {json_out.name}")
    print(f"Images dir:  {img_dir}")
    print("-"*110)
    print(f"Needed crawl: {len(need_crawl)}")
    print(f"Succeeded:    {success}")
    print(f"Failed:       {failed}")
    print("-"*110)

print("\nAll files processed.")


CRAWLING FILE: scientists_merged_with_bios_imaged.json  |  Provider: Wikimedia (fallbacks enabled)
Total records: 241 | Need crawl: 105

[0] Abbas Shafiee  → searching images…
  ℹ️ 2 candidate(s) found
  ✅ Candidate 1/2 OK → 0_Abbas_Shafiee_crawled.jpg

[1] Abdullah Ibn al-Muqaffa'  → searching images…
  ℹ️ 4 candidate(s) found
  ✅ Candidate 1/4 OK → 1_Abdullah_Ibn_al-Muqaffa_crawled.jpg

[2] Abu Mansur al-Maturidi  → searching images…
  ℹ️ 8 candidate(s) found
  ✅ Candidate 1/8 OK → 2_Abu_Mansur_al-Maturidi_crawled.jpg

[3] Abu Sa'id 'Ubayd Allah ibn Bakhtishu  → searching images…
  ℹ️ 6 candidate(s) found
  ✅ Candidate 1/6 OK → 3_Abu_Said_Ubayd_Allah_ibn_Bakhtishu_crawled.jpg

[4] Abu Zayd Ahmed ibn Sahl Balkhi  → searching images…
  ℹ️ 2 candidate(s) found
  ✅ Candidate 1/2 OK → 4_Abu_Zayd_Ahmed_ibn_Sahl_Balkhi_crawled.jpg

[5] Abū Sulaimān al-Maqdisī  → searching images…
  ℹ️ 1 candidate(s) found
  ✅ Candidate 1/1 OK → 5_Abū_Sulaimān_al-Maqdisī_crawled.jpg

[6] Ahmad Parsa  → sear

# Athlete 2 files

In [63]:
Athlete_report = merge_pair("athlete17.json", "athlete19.json", "Athlete_merged.json")


Merged: athlete17.json  +  athlete19.json  →  Athlete_merged.json
--------------------------------------------------------------------------------
Inputs:
  athlete17.json: 272
  athlete19.json: 3068
Name-key stats:
  duplicates: 239
  only_in_athlete17.json: 33
  only_in_athlete19.json: 2810
Duplicate image breakdown:
  both_have_image: 239
  only_first_has_image: 0
  only_second_has_image: 0
  neither_has_image: 0
Output records: 3082
--------------------------------------------------------------------------------
Duplicates detail (name | added_keys_from_second | merged_keys):
  - آرشام یسایی | +[] | merged=['_value']
  - آرمین تشکری | +[] | merged=['_value']
  - آرن داوودی | +[] | merged=['_value']
  - آلمدین زیلیکیچ | +[] | merged=['_value']
  - آلکس آغاسی | +[] | merged=['_value']
  - ابوالفضل انوری | +[] | merged=['_value']
  - ابوالفضل شهرجردی | +[] | merged=['_value']
  - احسان حدادی | +[] | merged=['_value']
  - احسان حسینی | +[] | merged=['_value']
  - احسان روزبهانی | +[] 

In [64]:
ATHLETE_JSON_IN = "Athlete_merged.json"
ATHLETE_JSON_OUT = "athlete_merged_with_bios.json"

ATHLETE_CSVS = ["athlete_biography_1.csv", "athlete_biography_2.csv"]

In [65]:
run_pipeline("athlete", ATHLETE_JSON_IN, ATHLETE_JSON_OUT, ATHLETE_CSVS)


ATHLETE — Merging narratives into JSON
CSV inputs: athlete_biography_1.csv, athlete_biography_2.csv
Biographies parsed (rows with '،'): 5881
Unique names in CSVs: 5293
Names with multiple bios: 569 (first few: کوین بولی, بهرام مشتاقی, محمد چهارمحالی, فرهاد پیروت‌پور, رشاد صادقوف, کیانوش رحمتی, علی تاجرنیا, اکبر میثاقیان, حمیدرضا زوهانی, ادمیر آدرویچ)
JSON input: Athlete_merged.json | records: 3082 | indexable by name: 3081
----------------------------------------------------------------------------------------
Added summary fields:
  summary : 3064
  summary2: 376
  summary3: 12
Skipped extra bios (already had summary3): 3
Unfound narrative records: 2229
Examples (normalized): ['کوین بولی', 'محمد چهارمحالی', 'علا عبدالزَهره', 'مهران عالیقدر', 'احسان جودکی', 'مهرشاد مؤمنی', 'بهروز رهبر', 'محمدرضا تندروان', 'سعید بهاءلو هوره', 'اسحاق سبحانی', 'سعید امرایی', 'fábio carvalho', 'آرش شرقی', 'حمید سلطانی', 'هوشنگ ملک\u200cلو', 'آدمار براگا', 'علیرضا قدیری', 'هاشم صیامی', 'پوریا کیاشمشکی', 'ا

In [66]:
FILES = ["athlete_merged_with_bios.json"]

reports = []
for fname in FILES:
    p = Path(fname)
    if p.exists():
        rep = summarize_file(p)
        reports.append(rep)
    else:
        print(f"⚠️  Skipping (not found): {fname}")

# Print per-file and totals
totals = {"total": 0, "summary": 0, "summary2": 0, "summary3": 0, "without_any_summary": 0}

for r in reports:
    print("\n" + "="*70)
    print(f"File: {r['file']}")
    print("-"*70)
    print(f"Total records:           {r['total']}")
    print(f"With summary:            {r['summary']}")
    print(f"With summary2:           {r['summary2']}")
    print(f"With summary3:           {r['summary3']}")
    print(f"Without any summary*:    {r['without_any_summary']}")
    print("="*70)

    for k in totals:
        totals[k] += r[k] if k in r else 0

if reports:
    print("\n" + "#"*70)
    print("AGGREGATE TOTALS (all processed files)")
    print("#"*70)
    print(f"Total records:           {totals['total']}")
    print(f"With summary:            {totals['summary']}")
    print(f"With summary2:           {totals['summary2']}")
    print(f"With summary3:           {totals['summary3']}")
    print(f"Without any summary*:    {totals['without_any_summary']}")
    print("#"*70)
else:
    print("No files processed.")


File: athlete_merged_with_bios.json
----------------------------------------------------------------------
Total records:           3082
With summary:            3064
With summary2:           376
With summary3:           12
Without any summary*:    18

######################################################################
AGGREGATE TOTALS (all processed files)
######################################################################
Total records:           3082
With summary:            3064
With summary2:           376
With summary3:           12
Without any summary*:    18
######################################################################


In [67]:
INPUTS = [
    {
        "json_in":  "athlete_merged_with_bios.json",
        "json_out": "athlete_merged_with_bios_imaged.json",
        "img_dir":  "athlete_images",
    }
]

In [69]:
for idx, rec in enumerate(records):
    name = rec.get("name") or f"record_{idx}"
    name_safe = sanitize_filename(name)
    image_obj = rec.get("image")

    print(f"\n[{idx}] {name}")
    record_ok = False
    any_urls_found = False

    if isinstance(image_obj, dict) and image_obj:
        # (UNCHANGED) original dict-style handling (young/adult/tomb, etc.)
        image_fields = list(image_obj.keys())
        if FIELDS_WHITELIST:
            image_fields = [k for k in image_fields if k in FIELDS_WHITELIST]

        if not image_fields:
            print("  ⚠️ image object present but no fields to check (whitelist filtered all).")

        for field in image_fields:
            field_val = image_obj.get(field)
            if field_val is None or (isinstance(field_val, (str, list, dict)) and not field_val):
                print(f"  • {field}: ⚠️ field missing/empty")
                continue

            urls = list(dict.fromkeys(iter_urls(field_val)))  # dedup in-order
            if not urls:
                print(f"  • {field}: ⚠️ no URL found under this field")
                continue

            any_urls_found = True
            print(f"  • {field}: ℹ️ found {len(urls)} URL(s) → resolving…")

            saved_this_field = False
            for url in urls:
                candidates = resolve_to_direct_images(url)
                if not candidates:
                    print(f"      - {url} → ℹ️ no direct image resolvable")
                    continue

                print(f"      - {url} → ℹ️ resolved {len(candidates)} candidate(s)")
                for cand in candidates:
                    filename = f"{idx}_{name_safe}_{sanitize_filename(field)}.jpg"
                    out_path = img_dir / filename
                    ok = download_image(cand, str(out_path))
                    if ok:
                        print(f"         ✅ download OK → {out_path.name}")
                        saved_this_field = True
                        record_ok = True
                        break
                    else:
                        print(f"         ❌ download FAILED for candidate")

                if saved_this_field:
                    break

            if not saved_this_field:
                print(f"  • {field}: ❌ URL extracted but download missed for all candidates")

            time.sleep(random.uniform(0.05, 0.2))

    # ===== ADDED: handle a single URL string in image =====
    elif isinstance(image_obj, str):
        urls = list(dict.fromkeys(iter_urls(image_obj)))  # will yield the string if it looks like a URL
        if not urls:
            print("  • image(str): ⚠️ no URL found in string value")
        else:
            any_urls_found = True
            print(f"  • image(str): ℹ️ found {len(urls)} URL(s) → resolving…")
            for url in urls:
                candidates = resolve_to_direct_images(url)
                if not candidates:
                    print(f"      - {url} → ℹ️ no direct image resolvable")
                    continue
                print(f"      - {url} → ℹ️ resolved {len(candidates)} candidate(s)")
                for cand in candidates:
                    filename = f"{idx}_{name_safe}_image.jpg"
                    out_path = img_dir / filename
                    ok = download_image(cand, str(out_path))
                    if ok:
                        print(f"         ✅ download OK → {out_path.name}")
                        record_ok = True
                        break
                    else:
                        print("         ❌ download FAILED for candidate")
                if record_ok:
                    break
            time.sleep(random.uniform(0.05, 0.2))

    # ===== ADDED: handle a list/tuple/set of URL strings =====
    elif isinstance(image_obj, (list, tuple, set)):
        list_urls = list(dict.fromkeys(iter_urls(list(image_obj))))  # de-dup in order
        if not list_urls:
            print("  • image(list): ⚠️ no URL(s) found in list")
        else:
            any_urls_found = True
            print(f"  • image(list): ℹ️ found {len(list_urls)} URL(s) → resolving…")
            for i, url in enumerate(list_urls, start=1):
                candidates = resolve_to_direct_images(url)
                if not candidates:
                    print(f"      - {url} → ℹ️ no direct image resolvable")
                    continue
                print(f"      - {url} → ℹ️ resolved {len(candidates)} candidate(s)")
                for cand in candidates:
                    filename = f"{idx}_{name_safe}_image{i}.jpg"
                    out_path = img_dir / filename
                    ok = download_image(cand, str(out_path))
                    if ok:
                        print(f"         ✅ download OK → {out_path.name}")
                        record_ok = True
                        break
                    else:
                        print("         ❌ download FAILED for candidate")
                if record_ok:
                    break
            time.sleep(random.uniform(0.05, 0.2))

    else:
        print("  ⚠️ image field missing/empty or unsupported type")

    # per-record summary + flag (UNCHANGED)
    rec["is_image_downloaded"] = bool(record_ok)
    if record_ok:
        print("  ▶︎ SUMMARY: ✅ Download COMPLETE for this record")
        downloaded_true += 1
    else:
        if any_urls_found:
            print("  ▶︎ SUMMARY: ❌ URL extracted but ALL downloads failed")
        else:
            print("  ▶︎ SUMMARY: ❌ URL missed (no usable URLs)")
        downloaded_false += 1

    time.sleep(random.uniform(0.05, 0.25))



[0] Hellper/صفحه تمرین
  • image(str): ℹ️ found 1 URL(s) → resolving…
      - https://upload.wikimedia.org/wikipedia/commons/thumb/c/ca/Flag_of_Iran.svg/40px-Flag_of_Iran.svg.png → ℹ️ resolved 1 candidate(s)
         ❌ download FAILED for candidate
  ▶︎ SUMMARY: ❌ URL extracted but ALL downloads failed

[1] Jorvan Vieira
  • image(list): ℹ️ found 1 URL(s) → resolving…
      - https://upload.wikimedia.org/wikipedia/commons/thumb/3/34/Jorvan_Vieira.jpg/250px-Jorvan_Vieira.jpg → ℹ️ resolved 1 candidate(s)
         ✅ download OK → 1_Jorvan_Vieira_image1.jpg
  ▶︎ SUMMARY: ✅ Download COMPLETE for this record

[2] آتسو استویکوف
  • image(str): ℹ️ found 1 URL(s) → resolving…
      - https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/20170621_Admira_Wacker_vs_Vardar_Skopje_Aco_Stojkov_DSC_6107.jpg/250px-20170621_Admira_Wacker_vs_Vardar_Skopje_Aco_Stojkov_DSC_6107.jpg → ℹ️ resolved 1 candidate(s)
         ✅ download OK → 2_آتسو_استویکوف_image.jpg
  ▶︎ SUMMARY: ✅ Download COMPLETE for thi

KeyboardInterrupt: 

In [70]:
for idx, rec in enumerate(records):
    name = rec.get("name") or f"record_{idx}"
    name_safe = sanitize_filename(name)
    image_obj = rec.get("image")

    # NEW: skip records already downloaded in a previous run
    if rec.get("is_image_downloaded", False) is True:
        print(f"\n[{idx}] {name}  ▶︎ already downloaded — skipping")
        downloaded_true += 1   # (or use a separate 'skipped' counter if you prefer)
        continue

    print(f"\n[{idx}] {name}")
    record_ok = False
    any_urls_found = False

    if isinstance(image_obj, dict) and image_obj:
        # (UNCHANGED) original dict-style handling (young/adult/tomb, etc.)
        image_fields = list(image_obj.keys())
        if FIELDS_WHITELIST:
            image_fields = [k for k in image_fields if k in FIELDS_WHITELIST]

        if not image_fields:
            print("  ⚠️ image object present but no fields to check (whitelist filtered all).")

        for field in image_fields:
            field_val = image_obj.get(field)
            if field_val is None or (isinstance(field_val, (str, list, dict)) and not field_val):
                print(f"  • {field}: ⚠️ field missing/empty")
                continue

            urls = list(dict.fromkeys(iter_urls(field_val)))  # dedup in-order
            if not urls:
                print(f"  • {field}: ⚠️ no URL found under this field")
                continue

            any_urls_found = True
            print(f"  • {field}: ℹ️ found {len(urls)} URL(s) → resolving…")

            saved_this_field = False
            for url in urls:
                candidates = resolve_to_direct_images(url)
                if not candidates:
                    print(f"      - {url} → ℹ️ no direct image resolvable")
                    continue

                print(f"      - {url} → ℹ️ resolved {len(candidates)} candidate(s)")
                for cand in candidates:
                    filename = f"{idx}_{name_safe}_{sanitize_filename(field)}.jpg"
                    out_path = img_dir / filename
                    ok = download_image(cand, str(out_path))
                    if ok:
                        print(f"         ✅ download OK → {out_path.name}")
                        saved_this_field = True
                        record_ok = True
                        break
                    else:
                        print(f"         ❌ download FAILED for candidate")

                if saved_this_field:
                    break

            if not saved_this_field:
                print(f"  • {field}: ❌ URL extracted but download missed for all candidates")

            time.sleep(random.uniform(0.05, 0.2))

    # ===== handle a single URL string in image =====
    elif isinstance(image_obj, str):
        urls = list(dict.fromkeys(iter_urls(image_obj)))  # will yield the string if it looks like a URL
        if not urls:
            print("  • image(str): ⚠️ no URL found in string value")
        else:
            any_urls_found = True
            print(f"  • image(str): ℹ️ found {len(urls)} URL(s) → resolving…")
            for url in urls:
                candidates = resolve_to_direct_images(url)
                if not candidates:
                    print(f"      - {url} → ℹ️ no direct image resolvable")
                    continue
                print(f"      - {url} → ℹ️ resolved {len(candidates)} candidate(s)")
                for cand in candidates:
                    filename = f"{idx}_{name_safe}_image.jpg"
                    out_path = img_dir / filename
                    ok = download_image(cand, str(out_path))
                    if ok:
                        print(f"         ✅ download OK → {out_path.name}")
                        record_ok = True
                        break
                    else:
                        print("         ❌ download FAILED for candidate")
                if record_ok:
                    break
            time.sleep(random.uniform(0.05, 0.2))

    # ===== handle a list/tuple/set of URL strings =====
    elif isinstance(image_obj, (list, tuple, set)):
        list_urls = list(dict.fromkeys(iter_urls(list(image_obj))))  # de-dup in order
        if not list_urls:
            print("  • image(list): ⚠️ no URL(s) found in list")
        else:
            any_urls_found = True
            print(f"  • image(list): ℹ️ found {len(list_urls)} URL(s) → resolving…")
            for i, url in enumerate(list_urls, start=1):
                candidates = resolve_to_direct_images(url)
                if not candidates:
                    print(f"      - {url} → ℹ️ no direct image resolvable")
                    continue
                print(f"      - {url} → ℹ️ resolved {len(candidates)} candidate(s)")
                for cand in candidates:
                    filename = f"{idx}_{name_safe}_image{i}.jpg"
                    out_path = img_dir / filename
                    ok = download_image(cand, str(out_path))
                    if ok:
                        print(f"         ✅ download OK → {out_path.name}")
                        record_ok = True
                        break
                    else:
                        print("         ❌ download FAILED for candidate")
                if record_ok:
                    break
            time.sleep(random.uniform(0.05, 0.2))

    else:
        print("  ⚠️ image field missing/empty or unsupported type")

    # per-record summary + flag (UNCHANGED)
    rec["is_image_downloaded"] = bool(record_ok)
    if record_ok:
        print("  ▶︎ SUMMARY: ✅ Download COMPLETE for this record")
        downloaded_true += 1
    else:
        if any_urls_found:
            print("  ▶︎ SUMMARY: ❌ URL extracted but ALL downloads failed")
        else:
            print("  ▶︎ SUMMARY: ❌ URL missed (no usable URLs)")
        downloaded_false += 1

    time.sleep(random.uniform(0.05, 0.25))


[0] Hellper/صفحه تمرین
  • image(str): ℹ️ found 1 URL(s) → resolving…
      - https://upload.wikimedia.org/wikipedia/commons/thumb/c/ca/Flag_of_Iran.svg/40px-Flag_of_Iran.svg.png → ℹ️ resolved 1 candidate(s)
         ❌ download FAILED for candidate
  ▶︎ SUMMARY: ❌ URL extracted but ALL downloads failed

[1] Jorvan Vieira  ▶︎ already downloaded — skipping

[2] آتسو استویکوف  ▶︎ already downloaded — skipping

[3] آتنا محمدی  ▶︎ already downloaded — skipping

[4] آتوسا پورکاشیان  ▶︎ already downloaded — skipping

[5] آتوسا گلشادنژاد
  • image(str): ℹ️ found 1 URL(s) → resolving…
      - https://upload.wikimedia.org/wikipedia/commons/thumb/c/ca/Flag_of_Iran.svg/40px-Flag_of_Iran.svg.png → ℹ️ resolved 1 candidate(s)
         ❌ download FAILED for candidate
  ▶︎ SUMMARY: ❌ URL extracted but ALL downloads failed

[6] آتیلا حجازی  ▶︎ already downloaded — skipping

[7] آدام جمیلی  ▶︎ already downloaded — skipping

[8] آدام همتی  ▶︎ already downloaded — skipping

[9] آدریانو آلوز  ▶︎ already do



         ✅ download OK → 1844_فهرست_قهرمانان_لیگ_فوتبال_زنان_ایران_image.jpg
  ▶︎ SUMMARY: ✅ Download COMPLETE for this record

[1845] فهرست مربیان باشگاه فوتبال سپاهان
  • image(str): ℹ️ found 1 URL(s) → resolving…
      - https://upload.wikimedia.org/wikipedia/commons/thumb/2/28/14031013_Patrice_Carteron_%28cropped%29.jpg/250px-14031013_Patrice_Carteron_%28cropped%29.jpg → ℹ️ resolved 1 candidate(s)
         ✅ download OK → 1845_فهرست_مربیان_باشگاه_فوتبال_سپاهان_image.jpg
  ▶︎ SUMMARY: ✅ Download COMPLETE for this record

[1846] فهرست مربیان باشگاه فوتبال پرسپولیس
  • image(str): ℹ️ found 1 URL(s) → resolving…
      - https://upload.wikimedia.org/wikipedia/fa/thumb/0/05/FC_Persepolis_Official_Logo.svg/120px-FC_Persepolis_Official_Logo.svg.png → ℹ️ resolved 1 candidate(s)
         ❌ download FAILED for candidate
  ▶︎ SUMMARY: ❌ URL extracted but ALL downloads failed

[1847] فهرست مربیان تیم ملی فوتبال ایران
  • image(str): ℹ️ found 1 URL(s) → resolving…
      - https://upload.wikimedi

In [75]:
need_crawl_count = sum(1 for r in records if not r.get("is_image_downloaded", False))
skipped_already_true = 0

for idx, rec in enumerate(records):
    name = rec.get("name") or f"record_{idx}"
    name_safe = sanitize_filename(name)

    # NEW: skip already-downloaded records (no re-download on reruns)
    if rec.get("is_image_downloaded", False) is True:
        print(f"\n[{idx}] {name}  ▶︎ already downloaded — skipping")
        skipped_already_true += 1
        continue

    query = name
    print(f"\n[{idx}] {name}  → searching images with query: {query} …")

    candidates = image_candidates_by_provider(query, MAX_IMAGE_CANDIDATES, start_provider)
    if not candidates:
        print("  ⚠️ No candidates found by any provider.")
        rec["is_image_downloaded"] = False
        failed += 1
        time.sleep(random.uniform(0.2, 0.5))
        continue

    print(f"  ℹ️ {len(candidates)} candidate(s) found")
    saved = False
    out_path = img_dir / f"{idx}_{name_safe}_crawled.jpg"

    for ci, url in enumerate(candidates, start=1):
        ok = download_image_to_jpeg(url, str(out_path))
        if ok:
            # --- CHANGED (preserve image value) ---
            img = rec.get("image")
            if isinstance(img, dict):
                img["crawled"] = url
            elif isinstance(img, (str, list)):
                rec["image"] = {"_value": img, "crawled": url}
            else:
                rec["image"] = {"crawled": url}

            rec["is_image_downloaded"] = True
            print(f"  ✅ Candidate {ci}/{len(candidates)} OK → {out_path.name}")
            saved = True
            success += 1
            break
        else:
            print(f"  ❌ Candidate {ci}/{len(candidates)} FAILED")
        time.sleep(random.uniform(0.15, 0.35))

    if not saved:
        rec["is_image_downloaded"] = False
        print("  ▶︎ RESULT: ❌ All candidates failed")
        failed += 1

    time.sleep(random.uniform(0.25, 0.6))

print(f"Needed crawl: {need_crawl_count}")
print(f"Skipped (already True): {skipped_already_true}")
print(f"Succeeded:    {success}")
print(f"Failed:       {failed}")


[0] Hellper/صفحه تمرین  → searching images with query: Hellper/صفحه تمرین …
  ⚠️ No candidates found by any provider.

[1] Jorvan Vieira  ▶︎ already downloaded — skipping

[2] آتسو استویکوف  ▶︎ already downloaded — skipping

[3] آتنا محمدی  ▶︎ already downloaded — skipping

[4] آتوسا پورکاشیان  ▶︎ already downloaded — skipping

[5] آتوسا گلشادنژاد  → searching images with query: آتوسا گلشادنژاد …
  ⚠️ No candidates found by any provider.

[6] آتیلا حجازی  ▶︎ already downloaded — skipping

[7] آدام جمیلی  ▶︎ already downloaded — skipping

[8] آدام همتی  ▶︎ already downloaded — skipping

[9] آدریانو آلوز  ▶︎ already downloaded — skipping

[10] آرا هاکوبیان   ▶︎ already downloaded — skipping

[11] آرارات آراکلیان  → searching images with query: آرارات آراکلیان …
  ⚠️ No candidates found by any provider.

[12] آرامائیس تونویان  → searching images with query: آرامائیس تونویان …
  ⚠️ No candidates found by any provider.

[13] آرتور میناسیان  → searching images with query: آرتور میناسیان …
 



  ✅ Candidate 1/3 OK → 1737_فاطمه_نعمتی_crawled.jpg

[1738] فاطمه همتی  ▶︎ already downloaded — skipping

[1739] فاطمه چالاکی  → searching images with query: فاطمه چالاکی …
  ⚠️ No candidates found by any provider.

[1740] فاطمه کرم‌زاده  ▶︎ already downloaded — skipping

[1741] فاطمه یاوری  → searching images with query: فاطمه یاوری …
  ⚠️ No candidates found by any provider.

[1742] فایز الرشیدی  ▶︎ already downloaded — skipping

[1743] فتح‌الله فریدی وثوق  → searching images with query: فتح‌الله فریدی وثوق …
  ⚠️ No candidates found by any provider.

[1744] فراز امامعلی  ▶︎ already downloaded — skipping

[1745] فراز امام‌علی  ▶︎ already downloaded — skipping

[1746] فراز فاطمی  → searching images with query: فراز فاطمی …
  ℹ️ 1 candidate(s) found
  ✅ Candidate 1/1 OK → 1746_فراز_فاطمی_crawled.jpg

[1747] فراز کمالوند  ▶︎ already downloaded — skipping

[1748] فرامرز آصف  ▶︎ already downloaded — skipping

[1749] فرامرز ظلی  ▶︎ already downloaded — skipping

[1750] فرانچسکو بریبانتی  ▶



  ✅ Candidate 1/10 OK → 2210_محمدرضا_اکبری_crawled.jpg

[2211] محمدرضا اکبریان  ▶︎ already downloaded — skipping

[2212] محمدرضا براری  ▶︎ already downloaded — skipping

[2213] محمدرضا بردبار  ▶︎ already downloaded — skipping

[2214] محمدرضا توپچی  → searching images with query: محمدرضا توپچی …
  ℹ️ 1 candidate(s) found
  ✅ Candidate 1/1 OK → 2214_محمدرضا_توپچی_crawled.jpg

[2215] محمدرضا حاج‌یوسف‌زاده  → searching images with query: محمدرضا حاج‌یوسف‌زاده …
  ⚠️ No candidates found by any provider.

[2216] محمدرضا حسینی  ▶︎ already downloaded — skipping

[2217] محمدرضا حضرت‌پور  ▶︎ already downloaded — skipping

[2218] محمدرضا خالدآبادی  ▶︎ already downloaded — skipping

[2219] محمدرضا خانزاده  ▶︎ already downloaded — skipping

[2220] محمدرضا خرسندنیا  ▶︎ already downloaded — skipping

[2221] محمدرضا خلعتبری  ▶︎ already downloaded — skipping

[2222] محمدرضا خیرالله زاده  → searching images with query: محمدرضا خیرالله زاده …
  ℹ️ 1 candidate(s) found
  ❌ Candidate 1/1 FAILED
  ▶︎ RESULT

In [78]:
PATH = Path("athlete_merged_with_bios.json")

with PATH.open("r", encoding="utf-8") as f:
    data = json.load(f)

true_count = 0
false_count = 0
missing_count = 0

for rec in data:
    val = rec.get("is_image_downloaded")
    if val is True:
        true_count += 1
    elif val is False:
        false_count += 1
    else:
        missing_count += 1

total = len(data)
print("\n=== is_image_downloaded status ===")
print(f"File: {PATH.name}")
print(f"Total records: {total}")
print(f"True : {true_count}")
print(f"False: {false_count}")
print(f"Missing/None: {missing_count}")
print(f"Checked sum: {true_count + false_count + missing_count} (should equal total)")


=== is_image_downloaded status ===
File: athlete_merged_with_bios.json
Total records: 3082
True : 0
False: 0
Missing/None: 3082
Checked sum: 3082 (should equal total)


In [79]:
import os
import re
import json
from pathlib import Path

INPUT_JSON  = Path("athlete_merged_with_bios.json")
IMG_DIR     = Path("athlete_images")
OUTPUT_JSON = Path("athlete_merged_with_bios_imaged_crawled.json")

def sanitize_filename(s: str) -> str:
    s = (str(s) or "").strip()
    s = re.sub(r"\s+", "_", s)
    s = re.sub(r"[^\w\-.]+", "", s)
    return s or "image"

# --- load data ---
with INPUT_JSON.open("r", encoding="utf-8") as f:
    records = json.load(f)

# --- gather image filenames once (lowercased for case-insensitive contains) ---
if IMG_DIR.exists():
    image_files_lower = [p.name.lower() for p in IMG_DIR.iterdir() if p.is_file()]
else:
    print(f"⚠️  Images folder does not exist: {IMG_DIR} (treating as empty)")
    image_files_lower = []

# --- process records ---
out = []
true_count = 0
false_count = 0

print("\n" + "="*80)
print(f"Scanning images in: {IMG_DIR}")
print(f"Input JSON: {INPUT_JSON.name}")
print("="*80)

for idx, rec in enumerate(records):
    name = rec.get("name") or f"record_{idx}"
    safe_name = sanitize_filename(name).lower()

    # check if any filename includes the safe_name
    has_image = any(safe_name in fname for fname in image_files_lower)

    rec_out = dict(rec)
    rec_out["is_image_downloaded"] = bool(has_image)
    out.append(rec_out)

    if has_image:
        true_count += 1
        print(f"[{idx}] {name} → ✅ found file containing '{safe_name}'")
    else:
        false_count += 1
        print(f"[{idx}] {name} → ❌ no file containing '{safe_name}'")

# --- write output ---
with OUTPUT_JSON.open("w", encoding="utf-8") as f:
    json.dump(out, f, ensure_ascii=False, indent=2)

# --- summary ---
total = len(records)
print("\n" + "-"*80)
print(f"Output JSON: {OUTPUT_JSON.name}")
print(f"Total records: {total}")
print(f"is_image_downloaded = True : {true_count}")
print(f"is_image_downloaded = False: {false_count}")
print(f"Checked sum: {true_count + false_count} (should equal total)")
print("-"*80)


Scanning images in: athlete_images
Input JSON: athlete_merged_with_bios.json
[0] Hellper/صفحه تمرین → ❌ no file containing 'hellperصفحه_تمرین'
[1] Jorvan Vieira → ✅ found file containing 'jorvan_vieira'
[2] آتسو استویکوف → ✅ found file containing 'آتسو_استویکوف'
[3] آتنا محمدی → ✅ found file containing 'آتنا_محمدی'
[4] آتوسا پورکاشیان → ✅ found file containing 'آتوسا_پورکاشیان'
[5] آتوسا گلشادنژاد → ❌ no file containing 'آتوسا_گلشادنژاد'
[6] آتیلا حجازی → ✅ found file containing 'آتیلا_حجازی'
[7] آدام جمیلی → ✅ found file containing 'آدام_جمیلی'
[8] آدام همتی → ✅ found file containing 'آدام_همتی'
[9] آدریانو آلوز → ✅ found file containing 'آدریانو_آلوز'
[10] آرا هاکوبیان  → ✅ found file containing 'آرا_هاکوبیان'
[11] آرارات آراکلیان → ❌ no file containing 'آرارات_آراکلیان'
[12] آرامائیس تونویان → ❌ no file containing 'آرامائیس_تونویان'
[13] آرتور میناسیان → ❌ no file containing 'آرتور_میناسیان'
[14] آرتور یدیگاریان → ✅ found file containing 'آرتور_یدیگاریان'
[15] آرتوش آساطوریان → ❌ n

In [80]:
INPUT_JSON = Path("athlete_merged_with_bios_imaged_crawled.json")
IMG_DIR    = Path("athlete_images")

# Set to True if you want to preview without deleting
DRY_RUN = False

def sanitize_filename(s: str) -> str:
    s = (str(s) or "").strip()
    s = re.sub(r"\s+", "_", s)
    s = re.sub(r"[^\w\-.]+", "", s)
    return s or "image"

def find_image_number(fname: str) -> int:
    """
    Extract number after '_image' if present, e.g. '_image3' -> 3; else large number.
    """
    m = re.search(r"_image(\d+)\b", fname)
    return int(m.group(1)) if m else 10**9

def keep_priority_key(p: Path) -> tuple:
    """
    Sort key for deciding which image to KEEP.
    Lower tuple sorts first (higher priority).
    """
    name = p.name.lower()
    return (
        0 if "_crawled" in name else 1,        # prefer crawled
        find_image_number(name),               # then prefer lower image number
        0 if "_image." in name else 1,         # then prefer plain _image.jpg
        name                                   # final tie-breaker
    )

# --- load records ---
with INPUT_JSON.open("r", encoding="utf-8") as f:
    records = json.load(f)

if not IMG_DIR.exists():
    print(f"⚠️  Images folder does not exist: {IMG_DIR}. Nothing to clean.")
else:
    all_files = [p for p in IMG_DIR.iterdir() if p.is_file()]
    all_files_lower = {p.name.lower(): p for p in all_files}  # map for quick access

deleted_files = []
kept_map = {}  # record index -> kept filename (for logs)
dupe_records = 0
no_image_records = 0
already_singleton = 0

print("\n" + "="*92)
print("Deduplicating athlete_images based on safe_name from athlete_merged_with_bios_imaged_crawled.json")
print("="*92)

for idx, rec in enumerate(records):
    name = rec.get("name") or f"record_{idx}"
    safe_name = sanitize_filename(name).lower()

    # Gather matches by substring of safe_name in filename (case-insensitive)
    matches = [p for p in all_files if safe_name in p.name.lower()]

    if len(matches) == 0:
        no_image_records += 1
        print(f"[{idx}] {name} → ❌ no matching images")
        continue

    if len(matches) == 1:
        already_singleton += 1
        kept_map[idx] = matches[0].name
        print(f"[{idx}] {name} → ✅ 1 image: keep {matches[0].name}")
        continue

    # More than one: choose one to keep, delete others
    dupe_records += 1
    matches_sorted = sorted(matches, key=keep_priority_key)
    keep = matches_sorted[0]
    to_delete = matches_sorted[1:]
    kept_map[idx] = keep.name

    print(f"[{idx}] {name} → ⚠️ {len(matches)} images found")
    print(f"    keep   : {keep.name}")
    if to_delete:
        for f in to_delete:
            print(f"    delete : {f.name}")
            if not DRY_RUN:
                try:
                    os.remove(f)
                    deleted_files.append(f.name)
                except Exception as e:
                    print(f"      → deletion failed for {f.name}: {e}")

# --- report ---
print("\n" + "-"*92)
print("DEDUP REPORT")
print("-"*92)
total = len(records)
print(f"Total records examined:          {total}")
print(f"Records with no images:          {no_image_records}")
print(f"Records with exactly 1 image:    {already_singleton}")
print(f"Records with duplicates cleaned: {dupe_records}")
print(f"Total files deleted:             {len(deleted_files)}")
if deleted_files:
    sample = deleted_files[:15]
    print(f"Deleted (sample up to 15):       {sample}")
print(f"Dry run mode:                    {DRY_RUN}")
print("-"*92)


Deduplicating athlete_images based on safe_name from athlete_merged_with_bios_imaged_crawled.json
[0] Hellper/صفحه تمرین → ❌ no matching images
[1] Jorvan Vieira → ✅ 1 image: keep 1_Jorvan_Vieira_image1.jpg
[2] آتسو استویکوف → ⚠️ 2 images found
    keep   : 2_آتسو_استویکوف_crawled.jpg
    delete : 2_آتسو_استویکوف_image.jpg
[3] آتنا محمدی → ✅ 1 image: keep 3_آتنا_محمدی_image.jpg
[4] آتوسا پورکاشیان → ✅ 1 image: keep 4_آتوسا_پورکاشیان_image.jpg
[5] آتوسا گلشادنژاد → ❌ no matching images
[6] آتیلا حجازی → ✅ 1 image: keep 6_آتیلا_حجازی_image.jpg
[7] آدام جمیلی → ✅ 1 image: keep 7_آدام_جمیلی_image.jpg
[8] آدام همتی → ✅ 1 image: keep 8_آدام_همتی_image.jpg
[9] آدریانو آلوز → ✅ 1 image: keep 9_آدریانو_آلوز_image.jpg
[10] آرا هاکوبیان  → ✅ 1 image: keep 10_آرا_هاکوبیان_crawled.jpg
[11] آرارات آراکلیان → ❌ no matching images
[12] آرامائیس تونویان → ❌ no matching images
[13] آرتور میناسیان → ❌ no matching images
[14] آرتور یدیگاریان → ✅ 1 image: keep 14_آرتور_یدیگاریان_image.jpg
[15] آرتوش آساطو