In [6]:
import os, os.path
from PIL import Image
import requests
from io import BytesIO
import xml.etree.ElementTree as ET
import tqdm
import pandas as pd
import numpy as np
import imagehash

SCRAPE_DIR = '.scrape_output'

### Steps to fetch the xml files
1. Install `pip install oaiharvest`
2. Create a folder in this repo called '.scrape_output'
3. `cd .scrape_output`
4. Run the following command: `oai-harvest --set=Beelddocument --metadataPrefix=ese https://denbosch.hosting.deventit.net/atlantispubliek/oai.axd` (This may take a while)

### Test code to get the format of the xml file

In [12]:
tree = ET.parse('.scrape_output/14444813.ese.xml')
root = tree.getroot()

items = []

for child in root:
    prefix, has_namespace, postfix = child.tag.partition('}')
    items.append({
        'key': postfix,
        'value': child.text
    })

pd.DataFrame(items)

Unnamed: 0,key,value
0,title,Titelblad van de in het bedrijf gedrukte bijbe...
1,creator,Onbekend
2,subject,Steegjes
3,subject,Drukkerijen
4,coverage,'s-Hertogenbosch;Achter de Exters
5,publisher,Stadsarchief 's-Hertogenbosch
6,publisher,http://www.stadsarchief.nl/
7,date,1894
8,type,Fotocopie
9,hasFormat,http://denbosch.hosting.deventit.net/HttpHandl...


### Create a list of the XML files

In [14]:
image_xml_files = [SCRAPE_DIR + '/' + name for name in os.listdir(SCRAPE_DIR)]
image_xml_files[:3]

['.scrape_output/14452242.ese.xml',
 '.scrape_output/14754600.ese.xml',
 '.scrape_output/14734482.ese.xml']

#### Loop through the images and transform them into a dictionaries

In [25]:
images = []

# xml namespace
ns = {
    'dc': 'http://purl.org/dc/elements/1.1/',
    'europeana': 'http://www.europeana.eu/schemas/ese/',
    'dcterms': 'http://purl.org/dc/terms/'
}

# No image hash
no_img = '00ff6767777f8300'

for image_xml in image_xml_files[:3]:
    # Parse as xml
    tree = ET.parse(image_xml)
    
    # Get the root of the tree
    root = tree.getroot()
    
    # Dictionary to store the image attributes
    image_attribs = {}
    
    # Image url
    image_url = root.find('europeana:object', ns).text
    
    # Check if the image exists
#     response = requests.get(image_url)
#     image = Image.open(BytesIO(response.content))
#     hasj = str(imagehash.average_hash(image))
    
#     # If there is no image
#     if hasj == no_img:
#         continue

#     # Size of the image
#     image_attribs['width'], image_attribs['height'] = image.size
    
    # Image url
    image_attribs['url'] = image_url
    
    # Get the title
    image_attribs['title'] = root.find('dc:title', ns).text.strip()
    
    # Location
    coverage = root.find('dc:coverage', ns)
    
    if coverage != None:
        coverage = coverage.text.split(';')
        
        # Don't store the image if it is not located in Den Bosch
        if coverage[0] != "'s-Hertogenbosch":
            continue
            
        street = coverage[1]
        image_attribs['street'] = street
            
    # Date
    image_attribs['date'] = root.find('dc:date', ns).text
    
    # Medium
    medium = root.find('dcterms:medium', ns)
    
    # Make sure the image is a photo
    if medium.text != 'Foto':
        continue
    
    # Add to list
    images.append(image_attribs)

In [26]:
df = pd.DataFrame(images)
df

Unnamed: 0,date,street,title,url
0,25/2/1952,Sint Jacobskerkhof,Carnaval. Groep leden van de oudste carnavalsv...,http://denbosch.hosting.deventit.net/HttpHandl...
1,8/2/1934,,RK Huishoud- en Industrieschool Marienburg. te...,http://denbosch.hosting.deventit.net/HttpHandl...
