In [1]:
# Install required packages
%pip install datasets==3.1.0 tqdm==4.67.1 pillow==10.4.0 requests==2.32.3 transformers==4.44.2

import os

# Lay out the dataset tree: the same three topics under both "real" and "ai",
# plus the data/_extra folder.
real_dirs = ['data/real/cars', 'data/real/buildings', 'data/real/medical']
ai_dirs = ['data/ai/cars', 'data/ai/buildings', 'data/ai/medical']
folders = [*real_dirs, *ai_dirs, 'data/_extra']

# exist_ok=True keeps this cell safe to re-run when the folders already exist.
for folder in folders:
    os.makedirs(folder, exist_ok=True)

print("✅ folders ready")

Note: you may need to restart the kernel to use updated packages.
✅ folders ready


In [2]:
import requests, time, pathlib, json
from typing import List, Dict

# Topic folder name -> search phrases sent to Wikimedia Commons, tried in order.
TOPICS_REAL: Dict[str, List[str]] = {
    "cars": [
        "car",
        "automobile",
        "sedan",
        "sports car",
    ],
    "buildings": [
        "building exterior",
        "house",
        "skyscraper",
        "apartment building",
    ],
    "medical": [
        "hospital interior",
        "medical equipment",
        "stethoscope",
        "x-ray machine",
    ],
}

# How many images to collect for each topic (bump later as needed).
TARGET_PER_TOPIC = 120

# Images are stored under SAVE_ROOT/<topic>/.
SAVE_ROOT = pathlib.Path("data/real")

# A single session and header set shared by every HTTP request below.
SESSION = requests.Session()
HEADERS = {"User-Agent": "synthetic-detector/0.1 (student project)"}

def fetch_commons_images(topic: str, queries: List[str], target: int) -> int:
    """Download up to ``target`` JPEG images for ``topic`` from Wikimedia Commons.

    Each phrase in ``queries`` is searched in turn (File namespace only) and the
    result pages are followed through the API's ``continue`` tokens until
    ``target`` images have been saved or the results run out.  Every saved image
    ``<topic>_<NNNNN>.jpg`` gets a ``<topic>_<NNNNN>.jpg.json`` sidecar that
    records where it came from.

    Args:
        topic: Sub-folder of ``SAVE_ROOT`` to write into; also the file-name prefix.
        queries: Search phrases, tried in order.
        target: Maximum number of images to save.

    Returns:
        The number of images actually saved (may be less than ``target``).

    Raises:
        requests.HTTPError: If a search API call itself returns an error status.
            Failures on individual image downloads are reported and skipped.
    """
    dest = SAVE_ROOT / topic
    dest.mkdir(parents=True, exist_ok=True)
    downloaded = 0
    seen = set()  # original-file URLs already attempted, shared across all queries
    for q in queries:
        if downloaded >= target:
            break
        cont = {}  # continuation tokens returned by the previous API page
        while downloaded < target:
            params = {
                "action": "query",
                "generator": "search",
                "gsrsearch": q,
                "gsrlimit": 50,
                "gsrnamespace": 6,     # restrict search to file namespace so we get actual images
                "prop": "imageinfo",
                "iiprop": "url|mime|extmetadata",
                "iiurlwidth": 1200,    # ask for a scaled rendition (returned as "thumburl")
                "format": "json",
            }
            params.update(cont)
            r = SESSION.get("https://commons.wikimedia.org/w/api.php",
                            params=params, headers=HEADERS, timeout=30)
            r.raise_for_status()
            data = r.json()
            pages = (data.get("query") or {}).get("pages") or {}
            if not pages and not data.get("continue"):
                break

            for p in pages.values():
                ii = p.get("imageinfo")
                if not ii:
                    continue
                meta = ii[0]
                url = meta.get("url") or meta.get("thumburl")
                mime = meta.get("mime", "")
                # Keep only JPEG originals (by MIME type or by file extension).
                if not url or ("jpeg" not in mime and not url.lower().endswith((".jpg", ".jpeg"))):
                    continue
                if url in seen:
                    continue
                seen.add(url)

                # Fetch the width-limited rendition requested via "iiurlwidth"
                # instead of the full-resolution original; fall back to the
                # original when the API did not return a thumbnail URL.
                download_url = meta.get("thumburl") or url
                fname = f"{topic}_{downloaded:05d}.jpg"
                img_path = dest / fname
                prov_path = dest / f"{fname}.json"

                # Step 1: download.  requests' exceptions derive from OSError,
                # so this covers HTTP errors, timeouts and connection problems.
                try:
                    img = SESSION.get(download_url, headers=HEADERS, timeout=60)
                    img.raise_for_status()
                    payload = img.content
                except OSError as exc:
                    print(f"[real/{topic}] skipped {download_url}: {exc}")
                    continue

                # Step 2: persist image + provenance as a pair.  If either write
                # fails, remove both so no image is left without its sidecar.
                prov = {
                    "source": "wikimedia_commons",
                    "query": q,
                    "url": url,
                    "title": p.get("title"),
                    "pageid": p.get("pageid"),
                    "mime": mime,
                    "download_url": download_url,
                }
                try:
                    img_path.write_bytes(payload)
                    prov_path.write_text(json.dumps(prov, indent=2))
                except OSError as exc:
                    img_path.unlink(missing_ok=True)
                    prov_path.unlink(missing_ok=True)
                    print(f"[real/{topic}] could not save {fname}: {exc}")
                    continue

                downloaded += 1
                if downloaded >= target:
                    break

            cont = data.get("continue") or {}
            # Stop when results are exhausted or the quota is met (no need to
            # pause before leaving the loop).
            if not cont or downloaded >= target:
                break
            time.sleep(0.25)  # be polite to the API between result pages
    return downloaded

# Run the downloader once per topic, reporting each topic's count as it
# finishes and the grand total at the end.
tot = 0
for topic, queries in TOPICS_REAL.items():
    n = fetch_commons_images(topic=topic, queries=queries, target=TARGET_PER_TOPIC)
    tot = tot + n
    print(f"[real/{topic}] downloaded: {n}")
print("✅ real total:", tot)

[real/cars] downloaded: 120
[real/buildings] downloaded: 120
[real/medical] downloaded: 120
✅ real total: 360


In [3]:
import os
# Display the kernel's working directory; the relative "data/..." paths used
# in this notebook are resolved against it.
os.getcwd()


'/workspaces/synthetic-detector/notebooks'

In [4]:
from pathlib import Path
import glob

def count(topic, root="data/real", pattern="*.jpg"):
    """Count the files stored for ``topic``.

    Args:
        topic: Name of the topic sub-folder (e.g. ``"cars"``).
        root: Folder that contains the topic sub-folders. Defaults to the
            real-image folder, ``"data/real"``.
        pattern: Glob pattern applied to file names inside the topic folder.
            The default ``"*.jpg"`` does not match the ``.jpg.json`` sidecars.

    Returns:
        Number of matching paths; 0 when the folder is missing or empty.
    """
    # Escape the directory part so glob metacharacters in a folder name
    # (such as "[" or "*") are taken literally; only ``pattern`` is a wildcard.
    topic_dir = glob.escape(str(Path(root) / topic))
    return len(glob.glob(f"{topic_dir}/{pattern}"))

# Report how many .jpg files each real-image topic folder holds.
for label, topic in (
    ("cars     :", "cars"),
    ("buildings:", "buildings"),
    ("medical  :", "medical"),
):
    print(label, count(topic))


cars     : 120
buildings: 120
medical  : 120


In [None]:
import getpass
import os

# Read the token with getpass so it is not echoed into the cell output
# (and therefore not saved inside the notebook file). The value is kept only
# in this kernel's environment.
os.environ["HF_API_TOKEN"] = getpass.getpass("Enter your HuggingFace API token: ").strip()
