In [None]:
import os
import pandas as pd
import numpy as np
import requests
import tensorflow as tf
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt

In [None]:
# Working-directory layout: raw CSVs live in DATA_DIR, with sub-folders for
# downloaded images and for derived output files.
DATA_DIR = 'nga_data'
IMAGE_DIR, PROCESSED_DIR = (
    os.path.join(DATA_DIR, subfolder) for subfolder in ('images', 'processed')
)

# Create the directory tree; exist_ok=True makes re-running this cell harmless.
for _directory in (DATA_DIR, IMAGE_DIR, PROCESSED_DIR):
    os.makedirs(_directory, exist_ok=True)

# Read the three source tables from DATA_DIR, in the same order as they are named.
objects, images, terms = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name), low_memory=False)
    for csv_name in ('objects.csv', 'published_images.csv', 'objects_terms.csv')
)

def filter_all_paintings(objects_df, images_df):
    """Join painting objects to their images and keep the primary views.

    Column names of both inputs are compared in lower case. The inputs
    themselves are NOT modified: the lower-casing is applied to renamed
    copies, so the caller's frames keep their original column labels.

    Parameters
    ----------
    objects_df : pandas.DataFrame
        Object table. Must provide (case-insensitively) the columns
        ``classification`` and ``objectid``.
    images_df : pandas.DataFrame
        Image table. The join key is ``depictstmsobjectid`` when that column
        is present, otherwise ``objectid``.

    Returns
    -------
    pandas.DataFrame
        Inner join of the painting rows with the image rows, with lower-case
        column names. If the joined frame has a ``viewtype`` column, only rows
        whose value is exactly ``'primary'`` are kept. The index is not reset.

    Raises
    ------
    KeyError
        If a required column is missing from either input.
    """
    # rename() returns new frames, leaving the caller's column labels intact.
    objects_lc = objects_df.rename(columns=str.lower)
    images_lc = images_df.rename(columns=str.lower)

    # Keep every row whose classification mentions "painting" (any case);
    # missing classifications count as non-matches.
    is_painting = objects_lc['classification'].str.contains('painting', case=False, na=False)
    paintings = objects_lc[is_painting]
    print(f"Found {len(paintings)} paintings in objects.csv")

    # Pick the column in the image table that refers back to the object ID.
    id_column = 'depictstmsobjectid' if 'depictstmsobjectid' in images_lc.columns else 'objectid'

    # Inner join: objects without any image row are dropped.
    painting_images = pd.merge(paintings, images_lc, left_on='objectid', right_on=id_column, how='inner')

    # Restrict to primary views when the view type is available.
    if 'viewtype' in painting_images.columns:
        painting_images = painting_images[painting_images['viewtype'] == 'primary']

    print(f"Final dataset contains {len(painting_images)} paintings with images")
    return painting_images

# Build the painting/image table from the `objects` and `images` frames read from CSV.
all_paintings_data = filter_all_paintings(objects, images)

def download_and_preprocess_images(df, target_size=(224, 224), max_images=None):
    """Download one resized JPEG per row of ``df`` and return a manifest.

    For each row the image is stored as ``<objectid>.jpg`` inside ``IMAGE_DIR``.
    A file that already exists is not downloaded again. Otherwise the image is
    fetched from ``<iiifurl>/full/600,/0/default.jpg``, converted to RGB,
    resized to ``target_size`` and saved. Rows whose download fails, or whose
    ``iiifurl`` is missing, are reported on stdout and left out of the result;
    they never abort the batch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``objectid`` and ``iiifurl``; ``title`` is
        optional (``'Unknown'`` is used when the column is absent).
    target_size : tuple of int, default (224, 224)
        Size passed to ``Image.resize`` as (width, height).
    max_images : int or None, default None
        When given, only the first ``max_images`` rows are processed.

    Returns
    -------
    pandas.DataFrame
        Columns ``objectID``, ``title`` and ``image_path``, one row per image
        available on disk. The columns are present even when no row succeeded.
    """
    result_columns = ['objectID', 'title', 'image_path']

    if max_images is not None:
        df = df.head(max_images)
    processed_data = []

    for _, row in df.iterrows():
        object_id = row['objectid']
        image_filename = f"{object_id}.jpg"
        image_path = os.path.join(IMAGE_DIR, image_filename)

        if os.path.exists(image_path):
            print(f"Image {image_filename} already exists, skipping download.")
        else:
            # A missing URL (e.g. NaN from the CSV) is not a string; skip the
            # row instead of letting .rstrip() raise and stop the whole loop.
            base_url = row['iiifurl']
            if not isinstance(base_url, str) or not base_url.strip():
                print(f"Error downloading image for object {object_id}: missing IIIF URL")
                continue
            base_url = base_url.rstrip("/")
            image_url = f"{base_url}/full/600,/0/default.jpg"

            # Write to a temporary name first so an interrupted save can never
            # leave a truncated .jpg that later runs would treat as complete.
            partial_path = f"{image_path}.part"
            try:
                response = requests.get(image_url, timeout=10)
                response.raise_for_status()
                img = Image.open(BytesIO(response.content)).convert("RGB")
                img = img.resize(target_size)
                # The temporary name has no .jpg suffix, so state the format.
                img.save(partial_path, format="JPEG")
                os.replace(partial_path, image_path)
                print(f"Saved image to {image_path}")
            except Exception as e:
                # Deliberately broad: one bad image must not stop the batch.
                print(f"Error downloading image for object {object_id}: {e}")
                continue
            finally:
                # After a successful replace this file no longer exists.
                if os.path.exists(partial_path):
                    os.remove(partial_path)

        processed_data.append({
            'objectID': object_id,
            'title': row.get('title', 'Unknown'),
            'image_path': image_path
        })

    # Fixing the columns keeps the manifest schema stable when nothing succeeded.
    return pd.DataFrame(processed_data, columns=result_columns)


In [2]:
# Fetch and resize the images for every painting row (max_images=None applies no row cap).
processed_paintings = download_and_preprocess_images(all_paintings_data, max_images=None)

# Write the resulting manifest next to the other processed outputs, without the index.
manifest_csv = os.path.join(PROCESSED_DIR, 'all_paintings_dataset.csv')
processed_paintings.to_csv(manifest_csv, index=False)

print("All NGA paintings dataset preparation complete!")


Found 4376 paintings in objects.csv
Final dataset contains 3941 paintings with images
Error downloading image for object 0: HTTPSConnectionPool(host='api.nga.gov', port=443): Max retries exceeded with url: /iiif/7b170a4c-9d44-475c-b294-cee6f43d88af/full/600,/0/default.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001758739AA10>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error downloading image for object 1: HTTPSConnectionPool(host='api.nga.gov', port=443): Max retries exceeded with url: /iiif/7bbcfd01-e774-46e7-96d1-a3b03598cd8a/full/600,/0/default.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000017592BADB10>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error downloading image for object 4: HTTPSConnectionPool(host='api.nga.gov', port=443): Max retries exceeded with url: /iiif/e6890364-4d99-4684-a62b-5060ee3c3446/full/600,/0/default.jpg (Caused b