# Portrait Art Dataset — Full Pipeline (English, Self‑Contained)

MSc Final Project — **Embodied Aesthetic Reconstruction**  
This single notebook builds a **high‑quality Portrait Art Dataset** using The Met public API and a curated Tate CSV. 
It defines a validated record schema, applies portrait/license/face filtering, exports the result as CSV and JSONL, and optionally downloads the images.

**Suggested location:** put this file at the root of your `EmbodiedAestheticReconstruction/` repo.
Outputs are saved under `data/` in the current working directory (this is the notebook's folder when the kernel is started from the repo root).

Badge for GitHub/Colab usage (replace `YOUR_USER` & `YOUR_REPO` after you upload):  
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/YOUR_USER/YOUR_REPO/blob/main/Portrait_Art_Dataset_Full_Pipeline.ipynb)

## 0. Setup (install once if needed)
If you are on Colab or a fresh environment, run this cell. Otherwise, install from your `requirements.txt` once.

In [None]:
%%bash
# Print the interpreter and pip versions; `|| true` keeps the cell going if either command fails.
python -V || true
pip -V || true
# Upgrade pip itself first, then install the third-party packages imported by the cells below.
pip install -q --upgrade pip
pip install -q requests pandas pydantic opencv-python tqdm python-slugify numpy PyYAML pillow

## 1. Imports & Paths

In [None]:
from __future__ import annotations
import os, io, json, yaml, random, time, glob
from pathlib import Path
from typing import Optional, List, Iterable, Dict, Any
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import cv2

# All output folders hang off the current working directory (this is the
# notebook's folder only when the kernel was started there).
ROOT = Path.cwd()
DATA_DIR = ROOT.joinpath('data')
# raw/ holds hand-curated inputs, images/ the downloaded files,
# interim/ the exported CSV and JSONL.
RAW_DIR, IMAGES_DIR, INTERIM_DIR = (
    DATA_DIR.joinpath(name) for name in ('raw', 'images', 'interim')
)
# Create the whole tree up front so later cells can write without checking.
for d in (DATA_DIR, RAW_DIR, IMAGES_DIR, INTERIM_DIR):
    d.mkdir(parents=True, exist_ok=True)
ROOT, DATA_DIR

## 2. Schema definition (Pydantic)

In [None]:
from pydantic import BaseModel, HttpUrl

class PortraitRecord(BaseModel):
    """Validated schema for one artwork row of the exported dataset.

    Only ``artwork_id``, ``source_api``, ``museum`` and ``artist_name_en`` are
    required; every other field may be missing. Keys that are not declared
    here (the fetchers also emit a helper key ``classification``) are dropped
    during validation instead of raising.
    """

    # Pydantic v2 configuration. The rest of this file already relies on the
    # v2 API (``model_dump``), where the nested ``class Config`` form is
    # deprecated.
    model_config = {'extra': 'ignore'}

    # Identity / provenance
    artwork_id: str                      # e.g. 'met_<objectID>' or 'tate_<row index>'
    source_api: str                      # 'met' or 'tate_local'
    museum: str
    artist_name_en: str

    # Descriptive metadata, passed through from the source
    artwork_title_en: Optional[str] = None
    year: Optional[str] = None           # free text, not parsed into a date
    medium: Optional[str] = None
    dimensions: Optional[str] = None
    style_period: Optional[str] = None
    subject_persons: Optional[List[str]] = None
    is_portrait: bool = True

    # Image locations; must be valid http(s) URLs when present
    image_url: Optional[HttpUrl] = None
    thumbnail_url: Optional[HttpUrl] = None

    # Rights
    license: Optional[str] = None
    credit_line: Optional[str] = None

    # Pixel size; not populated by the fetchers visible in this notebook
    width_px: Optional[int] = None
    height_px: Optional[int] = None
    notes: Optional[str] = None

## 3. Utilities (portrait heuristics, license, face detection)

In [None]:
# Identifies this pipeline to the remote servers on every request.
USER_AGENT = "PortraitArtDataset/1.0 (+https://example.org)"

# One shared session so every HTTP call in the notebook sends the same headers.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = USER_AGENT

# Substrings that mark a title or classification as a portrait
# (compared against lower-cased text in looks_like_portrait).
_PORTRAIT_KEYWORDS = ["portrait", "self-portrait", "self portrait"]

# Cache slot for the Haar cascade; filled on the first call to _face().
_FACE = None

def _face():
    """Return the shared frontal-face Haar cascade, building it on first use.

    The classifier is stored in the module-level ``_FACE`` so the XML file
    bundled with OpenCV is only loaded once per session.
    """
    global _FACE
    if _FACE is not None:
        return _FACE
    cascade_file = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
    _FACE = cv2.CascadeClassifier(cascade_file)
    return _FACE

def looks_like_portrait(title: str, classification: Optional[str] = None) -> bool:
    """Tell whether the title or classification mentions a portrait keyword.

    Args:
        title: Artwork title; a falsy value is treated as empty text.
        classification: Optional classification label from the source.

    Returns:
        True when any entry of ``_PORTRAIT_KEYWORDS`` occurs (case-insensitive)
        in the title and classification joined by a single space.
    """
    haystack = " ".join([title or '', classification or '']).lower()
    for keyword in _PORTRAIT_KEYWORDS:
        if keyword in haystack:
            return True
    return False

def license_ok(text: Optional[str], require_public: bool = True) -> bool:
    """Check a licence string against the public-domain requirement.

    Args:
        text: Licence description from the source, or None/empty if unknown.
        require_public: When false, every record passes regardless of ``text``.

    Returns:
        True if no public licence is required, or if ``text`` contains
        "public domain" or "cc0" (case-insensitive). A missing licence fails
        whenever a public licence is required.
    """
    if not require_public:
        return True
    if not text:
        return False
    lowered = text.lower()
    return "public domain" in lowered or "cc0" in lowered

def _safe_get(url: str, timeout: int = 20) -> Optional[bytes]:
    """Download ``url`` with the shared session and return the body bytes.

    Best-effort helper: any exception raised while requesting, as well as any
    status code other than 200, yields None instead of propagating.
    """
    try:
        response = SESSION.get(url, timeout=timeout)
        body = response.content if response.status_code == 200 else None
    except Exception:
        return None
    return body

def face_area_ratio_from_url(url: str) -> float:
    """Return the fraction of the image at ``url`` covered by detected faces.

    The image is downloaded, decoded, converted to greyscale and passed to the
    Haar cascade. The areas of all detected boxes are summed (overlapping
    boxes are each counted) and divided by the image area.

    Returns:
        0.0 when the download fails, the bytes cannot be decoded as an image,
        or the image has no pixels; otherwise the area ratio as a float.
    """
    payload = _safe_get(url)
    if not payload:
        return 0.0

    image = cv2.imdecode(np.frombuffer(payload, dtype=np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        return 0.0

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Positional 1.1 and 5 are detectMultiScale's scale factor and
    # min-neighbours; boxes smaller than 40x40 px are not reported.
    detections = _face().detectMultiScale(gray, 1.1, 5, minSize=(40, 40))

    height, width = gray.shape[:2]
    total_px = width * height
    if total_px <= 0:
        return 0.0

    face_px = sum(fw * fh for (_, _, fw, fh) in detections)
    return float(face_px / total_px)

def passes_face_check(img_url: Optional[str], min_ratio: float = 0.01) -> bool:
    """Tell whether detected faces cover at least ``min_ratio`` of the image.

    A missing URL, or any exception raised while fetching or analysing the
    image, counts as a failed check rather than an error.
    """
    if not img_url:
        return False
    try:
        ratio = face_area_ratio_from_url(img_url)
        verdict = ratio >= min_ratio
    except Exception:
        return False
    return verdict

## 4. PortraitFilter (metadata + license + optional face check)

In [None]:
class PortraitFilter:
    """Decide whether a raw record belongs in the portrait dataset.

    A record is kept when all of the following hold:

    * its licence passes ``license_ok`` for the configured policy;
    * it has at least one subject entry, or its title/classification looks
      like a portrait;
    * (optional) a face covering at least 1% of the image is detected.
    """

    def __init__(self, require_public_domain: bool = True, use_face_detection: bool = True):
        """Store the two policy switches used by ``keep``."""
        self.require_public_domain = require_public_domain
        self.use_face_detection = use_face_detection

    def keep(self, rec: dict) -> bool:
        """Return True when ``rec`` passes every enabled check.

        Args:
            rec: Raw record dict as produced by the fetchers; the keys read
                are ``artwork_title_en``, ``classification``,
                ``subject_persons``, ``license`` and ``image_url``.
        """
        title_ok = looks_like_portrait(rec.get('artwork_title_en', ''), rec.get('classification'))
        subject_ok = bool(rec.get('subject_persons')) or title_ok
        if not license_ok(rec.get('license'), self.require_public_domain):
            return False
        # Settle the cheap metadata verdict before the face check: that check
        # downloads the image, which is wasted work for a record that would
        # be rejected anyway.
        if not subject_ok:
            return False
        if self.use_face_detection:
            # passes_face_check already returns False for a missing URL.
            return passes_face_check(rec.get('image_url'), 0.01)
        return True

## 5. Fetchers — The Met API

In [None]:
# Base URL of The Met Collection API (v1); endpoint paths are appended to it.
MET_BASE = "https://collectionapi.metmuseum.org/public/collection/v1"
# Query prefixes; each one is combined with the artist name in met_search_ids().
MET_TERMS = ["portrait", "self-portrait", "self portrait"]

def met_search_ids(artist: str) -> List[int]:
    """Search The Met for portrait-like objects related to ``artist``.

    One search is issued per entry of ``MET_TERMS`` (query "<term> <artist>",
    restricted to objects with images) and the object IDs of all searches are
    merged.

    The lookup is best-effort: a search that fails at the network level,
    returns a non-200 status or an unparsable body is skipped, so a single
    bad request does not abort the whole run. This matches how failed object
    fetches are skipped in ``met_yield_records``.

    Returns:
        The distinct object IDs in ascending order, so repeated runs see a
        stable order. The list is empty when nothing was found.
    """
    ids = set()
    for term in MET_TERMS:
        try:
            resp = SESSION.get(
                f"{MET_BASE}/search",
                params={"q": f"{term} {artist}", "hasImages": "true"},
                timeout=30,
            )
            if resp.status_code != 200:
                continue
            payload = resp.json()
        except (requests.RequestException, ValueError):
            # Connection problem, timeout or invalid JSON: try the next term.
            continue
        if not isinstance(payload, dict):
            continue
        # 'objectIDs' may be null when there are no matches.
        for oid in (payload.get('objectIDs') or []):
            try:
                ids.add(int(oid))
            except (TypeError, ValueError):
                continue
    return sorted(ids)

def met_fetch_object(oid: int) -> Dict[str, Any]:
    """Fetch the full record of one Met object by its ID.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    object_url = f"{MET_BASE}/objects/{oid}"
    response = SESSION.get(object_url, timeout=30)
    response.raise_for_status()
    payload = response.json()
    return payload

def met_yield_records(artist_en: str) -> Iterable[dict]:
    """Yield one raw record dict per Met object found for ``artist_en``.

    Objects whose detail request fails are skipped. The dicts use the
    ``PortraitRecord`` field names plus a helper key ``classification`` that
    the portrait filter reads.

    Args:
        artist_en: English artist name used in the search; it is also the
            fallback when the object has no ``artistDisplayName``.
    """
    for oid in met_search_ids(artist_en):
        try:
            o = met_fetch_object(oid)
        except Exception:
            # Best effort: one unavailable object must not stop the run.
            continue

        # The Met API returns ``tags`` as a list of objects carrying a 'term'
        # key, while the schema expects a list of plain strings. Passing the
        # raw dicts through makes validation fail and the record vanish.
        # NOTE(review): these are general subject keywords, not necessarily
        # person names - confirm this is the intended use of subject_persons.
        subject_terms = []
        for tag in (o.get('tags') or []):
            term = tag.get('term') if isinstance(tag, dict) else tag
            if term:
                subject_terms.append(str(term))

        yield {
            'artwork_id': f'met_{oid}',
            'source_api': 'met',
            'museum': 'The Metropolitan Museum of Art',
            'artist_name_en': o.get('artistDisplayName') or artist_en,
            'artwork_title_en': o.get('title'),
            'year': o.get('objectDate'),
            'medium': o.get('medium'),
            'dimensions': o.get('dimensions'),
            'style_period': o.get('period'),
            'subject_persons': subject_terms or None,
            # Normalise an empty URL string to None: '' is not a valid
            # HttpUrl and would fail validation.
            'image_url': o.get('primaryImage') or o.get('primaryImageSmall') or None,
            'thumbnail_url': o.get('primaryImageSmall') or None,
            'license': 'Public Domain' if o.get('isPublicDomain') else None,
            'credit_line': o.get('creditLine'),
            'notes': o.get('objectName'),
            'classification': o.get('classification'),
        }

## 6. Fetchers — Tate Local CSV (hand‑picked)

In [None]:
def _split_subjects(x):
    if pd.isna(x) or x is None:
        return None
    s = str(x)
    if ';' in s:
        return [t.strip() for t in s.split(';') if t.strip()]
    if ',' in s:
        return [t.strip() for t in s.split(',') if t.strip()]
    return [s.strip()] if s.strip() else None

def _tate_text(value):
    """Return a CSV cell as stripped text, or None for an empty/NaN cell."""
    if value is None or pd.isna(value):
        return None
    if isinstance(value, float) and value.is_integer():
        # A numeric column containing blanks is read as float: 1785.0 -> "1785".
        value = int(value)
    text = str(value).strip()
    return text or None


def _tate_flag(value, default: bool = True) -> bool:
    """Interpret a CSV cell as a boolean, using ``default`` when it is blank."""
    if value is None or pd.isna(value):
        return default
    if isinstance(value, str):
        # bool("False") is True, so textual flags need explicit parsing.
        return value.strip().lower() not in {'', '0', 'false', 'no', 'n'}
    return bool(value)


def tate_yield_records(csv_path: str) -> Iterable[dict]:
    """Yield one raw record dict per row of the hand-picked Tate CSV.

    Blank cells are read by pandas as NaN. They are normalised to None here,
    and numeric cells such as ``year`` are converted to text, so the records
    can pass through ``license_ok`` and the string-typed schema fields
    instead of crashing or being rejected.

    Args:
        csv_path: Path of the CSV file. The columns ``artistName``, ``title``,
            ``year``, ``medium``, ``dimensions``, ``imageUrl`` and
            ``thumbnailUrl`` are required; ``style_period``,
            ``subjectPersons``, ``license``, ``creditLine``, ``notes``,
            ``classification`` and ``isPortrait`` are read when present.

    Raises:
        ValueError: If a required column is missing (raised on first iteration).
    """
    df = pd.read_csv(csv_path)
    req = ["artistName", "title", "year", "medium", "dimensions", "imageUrl", "thumbnailUrl"]
    for col in req:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")
    for i, row in df.iterrows():
        yield {
            'artwork_id': f'tate_{int(i)}',
            'source_api': 'tate_local',
            'museum': 'Tate Britain',
            # Required string in the schema: fall back to '' rather than "nan".
            'artist_name_en': _tate_text(row.get('artistName')) or '',
            'artwork_title_en': _tate_text(row.get('title')),
            'year': _tate_text(row.get('year')),
            'medium': _tate_text(row.get('medium')),
            'dimensions': _tate_text(row.get('dimensions')),
            'style_period': _tate_text(row.get('style_period', None)),
            'subject_persons': _split_subjects(row.get('subjectPersons')),
            'image_url': _tate_text(row.get('imageUrl')),
            'thumbnail_url': _tate_text(row.get('thumbnailUrl')),
            'license': _tate_text(row.get('license')),
            'credit_line': _tate_text(row.get('creditLine')),
            'notes': _tate_text(row.get('notes')),
            'classification': _tate_text(row.get('classification')),
            'is_portrait': _tate_flag(row.get('isPortrait', True)),
        }

## 7. Pipeline (combine sources → filter → validate → export)

In [None]:
from pydantic import ValidationError

class Pipeline:
    """Combine the enabled sources, filter, validate and export the dataset.

    The configuration is a dict (normally loaded from ``artists.yaml``) with
    the keys ``museums`` (list of ``{'key', 'enabled'}`` entries), ``artists``
    (list of ``{'en': name}`` entries) and ``options``. Missing or null
    sections are treated as empty.
    """

    def __init__(self, config: dict):
        """Store the configuration and build the portrait filter from ``options``."""
        # An empty YAML file loads as None; treat that like an empty config.
        self.config = config or {}
        opts = self.config.get('options') or {}
        self.filter = PortraitFilter(
            require_public_domain=bool(opts.get('require_public_domain', True)),
            use_face_detection=bool(opts.get('use_face_detection', True)),
        )

    def _enabled(self, key: str) -> bool:
        """Return True when the museum entry with this ``key`` is enabled."""
        for m in (self.config.get('museums') or []):
            if m.get('key') == key:
                return bool(m.get('enabled', False))
        return False

    def _collect(self) -> List[dict]:
        """Gather raw records from every enabled source that pass the filter.

        Each ``artwork_id`` is evaluated at most once. Searches for different
        artists can return the same Met object, and re-checking it would
        repeat the image download and duplicate the row.
        """
        records: List[dict] = []
        seen = set()

        def consider(rec: dict) -> None:
            rec_id = rec.get('artwork_id')
            if rec_id in seen:
                return
            seen.add(rec_id)
            if self.filter.keep(rec):
                records.append(rec)

        # MET
        if self._enabled('met'):
            for a in (self.config.get('artists') or []):
                name = a.get('en') if isinstance(a, dict) else None
                if not name:
                    continue  # entry without an English name: nothing to search
                for rec in met_yield_records(name):
                    consider(rec)
        # Tate local
        if self._enabled('tate_local'):
            csv_path = str(RAW_DIR / 'tate_selected.csv')
            if os.path.exists(csv_path):
                for rec in tate_yield_records(csv_path):
                    consider(rec)
            else:
                print(f"Tate source enabled but file not found: {csv_path}")
        return records

    def run(self, out_csv: str, out_jsonl: str):
        """Run the pipeline and write the CSV and JSONL exports.

        Args:
            out_csv: Destination path of the CSV export.
            out_jsonl: Destination path of the JSON Lines export.

        Returns:
            The list of validated rows as JSON-compatible dicts.
        """
        records = self._collect()

        # Validate
        valid = []
        rejected = 0
        for r in records:
            try:
                # mode='json' turns HttpUrl values into plain strings; the
                # default dump keeps URL objects, which json.dumps below
                # cannot serialise.
                valid.append(PortraitRecord(**r).model_dump(mode='json'))
            except ValidationError:
                rejected += 1
        if rejected:
            # Report instead of dropping silently, so schema problems show up.
            print(f"Skipped {rejected} record(s) that failed schema validation")

        # Export; fixing the columns keeps the CSV header present even when
        # no row survived.
        columns = list(PortraitRecord.model_fields)
        pd.DataFrame(valid, columns=columns).to_csv(out_csv, index=False)
        with open(out_jsonl, 'w', encoding='utf-8') as f:
            for row in valid:
                f.write(json.dumps(row, ensure_ascii=False) + '\n')
        return valid

## 8. Load or create `artists.yaml`

In [None]:
# The configuration lives next to the data folder; a starter file is written
# the first time the notebook runs so it can be edited afterwards.
cfg_path = ROOT / 'artists.yaml'
if not cfg_path.exists():
    sample_cfg = {
        # Sources the pipeline may use; `key` is what Pipeline._enabled() looks up.
        'museums': [
            {'name': 'The Metropolitan Museum of Art', 'key': 'met', 'enabled': True},
            {'name': 'Tate Britain (Local CSV)', 'key': 'tate_local', 'enabled': False},
        ],
        # English artist names used for the Met searches.
        'artists': [
            {'en': 'Jean-Auguste-Dominique Ingres'},
            {'en': 'Thomas Gainsborough'},
        ],
        # NOTE(review): `min_image_width` is not read by any code visible in
        # this notebook - confirm whether it is still needed.
        'options': {'min_image_width': 600, 'require_public_domain': True, 'use_face_detection': True}
    }
    cfg_path.write_text(
        yaml.safe_dump(sample_cfg, allow_unicode=True, sort_keys=False),
        encoding='utf-8',
    )
CONFIG = yaml.safe_load(cfg_path.read_text(encoding='utf-8'))
CONFIG

## 9. Run pipeline → export CSV/JSONL

In [None]:
# Export targets under data/interim/; converted to str because Pipeline.run()
# takes both paths as strings.
out_csv = str(INTERIM_DIR / 'portrait_art_dataset.csv')
out_jsonl = str(INTERIM_DIR / 'portrait_art_dataset.jsonl')
# Build the pipeline from the loaded YAML config and run it; `rows` is the
# list of validated records returned by run().
pipe = Pipeline(CONFIG)
rows = pipe.run(out_csv, out_jsonl)
print(f"Saved: {out_csv} | {out_jsonl} | rows={len(rows)}")

## 10. Preview dataset

In [None]:
# Reload the export from disk; fall back to an empty frame when the pipeline
# has not produced a CSV yet.
if os.path.exists(out_csv):
    df = pd.read_csv(out_csv)
else:
    df = pd.DataFrame()
df.head(10)

## 11. (Optional) Download images to `data/images/`

In [None]:
from slugify import slugify

def download(url: str, path: Path) -> bool:
    """Fetch ``url`` with the shared session and save the body to ``path``.

    Returns:
        True when the server answered 200 and the file was written; False for
        any other status or if requesting or writing raised an exception.
    """
    try:
        response = SESSION.get(url, timeout=30)
        if response.status_code != 200:
            return False
        with open(path, 'wb') as sink:
            sink.write(response.content)
    except Exception:
        return False
    return True

# Download every exported record's image into data/images/, skipping files
# that already exist so the cell can be re-run safely.
download_count = 0
if os.path.exists(out_jsonl):
    # Read the export once inside a context manager. Opening it a second time
    # just to count lines left a file handle that was never closed.
    with open(out_jsonl, 'r', encoding='utf-8') as f:
        lines = [ln for ln in f if ln.strip()]  # blank lines are not records
    for line in tqdm(lines, total=len(lines)):
        row = json.loads(line)
        url = row.get('image_url')
        if not url:
            continue
        # NOTE(review): the file name is built from source, artist and title
        # only, so two works sharing all three map to the same file and the
        # second is skipped. Adding artwork_id would fix this, but confirm
        # first that the downstream matcher does not rely on this naming.
        name = slugify(f"{row.get('source_api')}_{row.get('artist_name_en')}_{row.get('artwork_title_en')}")
        out_path = IMAGES_DIR / f"{name}.jpg"
        if out_path.exists():
            continue
        if download(url, out_path):
            download_count += 1
download_count

## 12. Show random downloaded images

In [None]:
# Every .jpg currently present in data/images/.
jpgs = list(IMAGES_DIR.glob('*.jpg'))
# Pick at most six at random; the guard leaves `sample` empty when no image exists.
sample = random.sample(jpgs, min(6, len(jpgs))) if jpgs else []
for p in sample:
    # `display` is not defined in this file; presumably the IPython/Jupyter built-in.
    display(Image.open(p))
# Total number of image files found.
len(jpgs)

## 13. Quick stats

In [None]:
# Summarise the exported rows per museum and for the ten most frequent artists.
if df.empty:
    print('No data yet. Try re-running the pipeline or check API connectivity.')
else:
    print('By museum:')
    museum_counts = df['museum'].value_counts().to_frame('count')
    display(museum_counts)
    print('By artist:')
    artist_counts = df['artist_name_en'].value_counts().head(10).to_frame('count')
    display(artist_counts)

## 14. Next steps (connect with matcher)
- Use `data/interim/portrait_art_dataset.jsonl` + `data/images/` in your *match-only* system.
- Example (run from the matcher repo):
```bash
python indexing/build_index.py \
  --dataset_jsonl /absolute/path/to/EmbodiedAestheticReconstruction/data/interim/portrait_art_dataset.jsonl \
  --images_dir    /absolute/path/to/EmbodiedAestheticReconstruction/data/images
```

**Tip:** Keep code/artifacts (CSV/JSONL) in Git; keep large image files out of Git (use `.gitignore`).