In [1]:
import os, os.path
from PIL import Image
import requests
from io import BytesIO
import xml.etree.ElementTree as ET
import tqdm
import pandas as pd
import numpy as np
import imagehash

# Directory whose entries are listed further down to build the list of
# harvested XML files (see the fetch steps below for how it is filled).
SCRAPE_DIR = '.scrape_output'

### Steps to fetch the xml files
1. Install the harvester: `pip install oaiharvest`
2. Create a folder in this repo called '.scrape_output'
3. `cd .scrape_output`
4. Run the following command: `oai-harvest --set=Beelddocument --metadataPrefix=ese https://denbosch.hosting.deventit.net/atlantispubliek/oai.axd` (This may take a while)

### Test code to inspect the format of a single XML file

In [2]:
# Parse one harvested record and take its root element
tree = ET.parse('.scrape_output/14433440.ese.xml')
root = tree.getroot()

# One row per direct child of the root: the tag with everything up to and
# including the first '}' removed (the '{namespace}' prefix), plus its text.
items = [
    {'key': child.tag.partition('}')[2], 'value': child.text}
    for child in root
]

pd.DataFrame(items)

Unnamed: 0,key,value
0,title,Heien van de eerste paal voor de woningen voor...
1,creator,"Janssens, Felix"
2,subject,Exterieur
3,subject,Woningbouw
4,subject,Nieuwbouw
5,coverage,'s-Hertogenbosch;Schaarhuispad
6,publisher,Stadsarchief 's-Hertogenbosch
7,publisher,http://www.stadsarchief.nl/
8,date,10/3/1987
9,type,Foto


### Create a list of the XML files

In [3]:
# Collect the harvested records. os.listdir() returns every directory entry in
# arbitrary order, so keep only the *.xml files (a stray checkpoint folder or
# hidden file would otherwise crash the XML parser later on) and sort them so
# that repeated runs process the same files in the same order.
image_xml_files = sorted(
    os.path.join(SCRAPE_DIR, name)
    for name in os.listdir(SCRAPE_DIR)
    if name.endswith('.xml')
)
image_xml_files[:3]

['.scrape_output/104915350.ese.xml',
 '.scrape_output/104915448.ese.xml',
 '.scrape_output/104915560.ese.xml']

#### Loop through the XML records and transform each one into a dictionary

In [176]:
images = []

# xml namespace
ns = {
    'dc': 'http://purl.org/dc/elements/1.1/',
    'europeana': 'http://www.europeana.eu/schemas/ese/',
    'dcterms': 'http://purl.org/dc/terms/'
}

# No image hash. It was compared against the average hash of the downloaded
# image by a download check that is currently disabled; presumably it is the
# hash of the placeholder served when no scan exists (TODO confirm). The name
# is kept so that check can be restored.
no_img = '00ff6767777f8300'

# Number of harvested files to process; set to None to process all of them.
MAX_FILES = 3


def _element_text(root, path):
    """Return the stripped text of the first element matching `path`.

    Returns None when the element is missing or has no text, so callers never
    dereference `.text` on a missing node.
    """
    node = root.find(path, ns)
    if node is None or node.text is None:
        return None
    return node.text.strip()


def _parse_year(date_text):
    """Return the trailing four-digit year of `date_text` as a string, or None.

    Dates such as '10/3/1987' yield '1987'. Anything whose last four
    characters are not all digits (missing date, '3/87', '1987-03-10') yields
    None instead of a bogus year.
    """
    if not date_text:
        return None
    year = date_text[-4:]
    if len(year) == 4 and year.isdigit():
        return year
    return None


def _parse_image_record(root):
    """Turn one parsed record into a dict of image attributes.

    Returns None when the record must be skipped: it is not a photo
    (dcterms:medium other than 'Foto'), it has no image url, or its coverage
    names a place other than 's-Hertogenbosch.
    """
    # Make sure the image is a photo
    if _element_text(root, 'dcterms:medium') != 'Foto':
        return None

    # A record without an image url is of no use downstream
    image_url = _element_text(root, 'europeana:object')
    if not image_url:
        return None

    # Location, formatted as "<place>;<street>"; the street part is optional
    street = None
    coverage = _element_text(root, 'dc:coverage')
    if coverage:
        parts = [part.strip() for part in coverage.split(';')]

        # Don't store the image if it is not located in Den Bosch
        if parts[0] != "'s-Hertogenbosch":
            return None

        if len(parts) > 1 and parts[1]:
            street = parts[1]

    return {
        'url': image_url,
        'title': _element_text(root, 'dc:title'),
        # Always present (None when unknown) so the dataframe built from these
        # dicts has a 'street' column even if no record carries a street.
        'street': street,
        'year': _parse_year(_element_text(root, 'dc:date')),
    }


for image_xml in image_xml_files[:MAX_FILES]:
    # Parse as xml and get the root of the tree
    root = ET.parse(image_xml).getroot()

    image_attribs = _parse_image_record(root)

    # Add to list unless the record was filtered out
    if image_attribs is not None:
        images.append(image_attribs)

In [178]:
# Build the dataframe from the collected image dictionaries and preview it
df = pd.DataFrame(data=images)
df.head(n=5)

Unnamed: 0,street,title,url,year
0,Muntelplein,Arbeiderswoningen van woningbouwvereniging mgr...,http://denbosch.hosting.deventit.net/HttpHandl...,1930
1,,Arbeiderswoningen van woningbouwvereniging mgr...,http://denbosch.hosting.deventit.net/HttpHandl...,1930
2,,Arbeiderswoningen van woningbouwvereniging mgr...,http://denbosch.hosting.deventit.net/HttpHandl...,1930


In [40]:
# Export the dataframe to MySQL
import sqlalchemy
from urllib.parse import quote_plus

# Connection settings are read from the environment so that no credentials are
# stored in the notebook. The password and host that used to be hardcoded here
# have been published with the notebook and should be rotated.
database_username = os.environ['SCAVENGER_DB_USER']
database_password = os.environ['SCAVENGER_DB_PASSWORD']
database_ip       = os.environ['SCAVENGER_DB_HOST']
database_name     = os.environ.get('SCAVENGER_DB_NAME', 'scavenger')

# User name and password are URL-quoted because characters such as '@' or '/'
# would otherwise break the connection URL.
database_connection = sqlalchemy.create_engine(
    'mysql+mysqlconnector://{0}:{1}@{2}/{3}'.format(
        quote_plus(database_username), quote_plus(database_password),
        database_ip, database_name))

# if_exists='replace' drops and recreates the 'images' table on every run
df.to_sql(con=database_connection, name='images', if_exists='replace')

In [210]:
# Distinct values of the 'street' column
street_names = df.loc[:, 'street'].unique()

In [214]:
import requests
import json

# HERE app_id & app_code, read from the environment so that the credentials
# are not stored in the notebook. The values that used to be hardcoded here
# have been published with the notebook and should be revoked.
# Both fall back to '' when unset, in which case the geocoder calls will fail.
APP_ID = os.environ.get('HERE_APP_ID', '')
APP_CODE = os.environ.get('HERE_APP_CODE', '')

# Function that collects multiple location-coordinates of a single street
def get_location(streetname):
    """Collect several coordinates for a single street in 's-Hertogenbosch.

    Parameters
    ----------
    streetname : str
        Street name without the city. Blank strings and non-string values
        (None, or the NaN that pandas uses for missing cells) mean "no street".

    Returns
    -------
    tuple or None
        Four (latitude, longitude) pairs taken from the first geocoder result:
        display position, first navigation position, map view top-left and map
        view bottom-right. None when no street was given or nothing was found.

    Raises
    ------
    requests.HTTPError
        If the geocoder answers with an error status (e.g. bad credentials).
    """
    # Missing dataframe cells arrive as NaN/None; concatenating those to the
    # query string used to raise a TypeError.
    if not isinstance(streetname, str) or not streetname.strip():
        return None

    # NOTE(review): this is the app_id/app_code flavoured 6.2 endpoint;
    # confirm it is still served for these credentials.
    response = requests.get(
        'https://geocoder.api.here.com/6.2/geocode.json',
        # Passing params lets requests URL-encode the street name
        params={
            'app_id': APP_ID,
            'app_code': APP_CODE,
            'searchtext': streetname.strip() + ", 's-Hertogenbosch",
        },
        # Don't let one stalled request hang the whole notebook
        timeout=10,
    )
    response.raise_for_status()
    r = response.json()

    views = r.get('Response', {}).get('View') or []
    if not views or not views[0].get('Result'):
        return None

    location = views[0]['Result'][0]['Location']

    # DisplayPosition
    location_1 = location['DisplayPosition']

    # NavigationPosition
    location_2 = location['NavigationPosition'][0]

    # MapView
    location_3_A = location['MapView']['TopLeft']
    location_3_B = location['MapView']['BottomRight']

    return ((location_1['Latitude'], location_1['Longitude']),
            (location_2['Latitude'], location_2['Longitude']),
            (location_3_A['Latitude'], location_3_A['Longitude']),
            (location_3_B['Latitude'], location_3_B['Longitude']))

In [225]:
# Distinct values of the dataframe's 'street' column
street_names = df.loc[:, 'street'].unique()

# Take the first of them as sample input for the geocoder
test_street = street_names[0]
test_street

'Muntelplein'

In [226]:
# Geocode the sample street (this performs a live request to the geocoder)
test_result = get_location(streetname=test_street)
test_result

((51.69426, 5.30766),
 (51.69426, 5.30766),
 (51.69467, 5.30671),
 (51.6942, 5.3077))

In [220]:
# Geocode every distinct street only once: many rows share a street, and each
# lookup is a network request. Missing streets (NaN/None) are left out.
street_coordinates = {
    street: get_location(street)
    for street in df['street'].unique()
    if isinstance(street, str)
}

# Create list with coordinates for every row in the dataframe
# (None for rows without a street)
coordinates = [
    street_coordinates.get(street) if isinstance(street, str) else None
    for street in df['street']
]

# Insert the 'coordinates' list into the dataframe. Drop a column left over
# from an earlier run first; df.insert() raises a ValueError if it exists.
if 'coordinates' in df.columns:
    del df['coordinates']
df.insert(1, 'coordinates', coordinates, False)

In [223]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,street,coordinates,title,url,year
0,Muntelplein,"((51.69426, 5.30766), (51.69426, 5.30766), (51...",Arbeiderswoningen van woningbouwvereniging mgr...,http://denbosch.hosting.deventit.net/HttpHandl...,1930
1,,,Arbeiderswoningen van woningbouwvereniging mgr...,http://denbosch.hosting.deventit.net/HttpHandl...,1930
2,,,Arbeiderswoningen van woningbouwvereniging mgr...,http://denbosch.hosting.deventit.net/HttpHandl...,1930
