In [None]:
import os, os.path
from PIL import Image
import requests
from io import BytesIO
import xml.etree.ElementTree as ET
import tqdm
import pandas as pd
import numpy as np
import imagehash

# Folder (relative to the notebook) that holds the harvested XML records
SCRAPE_DIR = '.scrape_output'

### Steps to fetch the xml files
1. Install the harvester: `pip install oaiharvest`
2. Create a folder in this repo called '.scrape_output'
3. `cd .scrape_output`
4. Run the following command: `oai-harvest --set=Beelddocument --metadataPrefix=ese https://denbosch.hosting.deventit.net/atlantispubliek/oai.axd` (This may take a while)

### Test code to get the format of the xml file

In [None]:
# Inspect a single harvested record to see which fields the XML exposes.
# The path is built from SCRAPE_DIR so it follows the configured folder.
tree = ET.parse(os.path.join(SCRAPE_DIR, '14433440.ese.xml'))
root = tree.getroot()

# ElementTree tags look like '{namespace-uri}name'. rpartition keeps the whole
# tag as the key when an element has no namespace (partition would give '').
items = [
    {'key': child.tag.rpartition('}')[2], 'value': child.text}
    for child in root
]

pd.DataFrame(items)

### Create a list of the XML files

In [None]:
# Keep only the XML records (stray files in the folder would break the XML
# parser further down) and sort them so the processing order is reproducible.
image_xml_files = sorted(
    os.path.join(SCRAPE_DIR, name)
    for name in os.listdir(SCRAPE_DIR)
    if name.endswith('.xml')
)
image_xml_files[:3]

#### Loop through the images and transform them into dictionaries

In [None]:
from tqdm import tqdm

images = []

# xml namespace
ns = {
    'dc': 'http://purl.org/dc/elements/1.1/',
    'europeana': 'http://www.europeana.eu/schemas/ese/',
    'dcterms': 'http://purl.org/dc/terms/'
}

# Average hash that marks a record as having no image
no_img = '00ff6767777f8300'

# Seconds to wait for the image server before giving up on a record
REQUEST_TIMEOUT = 30


def _find_text(root, path):
    '''Return the text of the first element matching `path`, or None if the element or its text is missing.'''
    node = root.find(path, ns)
    return None if node is None else node.text


def _extract_year(date_text):
    '''Return the last four characters of `date_text` when they are four digits, otherwise None.'''
    if not date_text:
        return None
    tail = date_text[-4:]
    return tail if len(tail) == 4 and tail.isdigit() else None


# Iterating the list directly lets tqdm show the total number of records
for image_xml in tqdm(image_xml_files):

    # Parse as xml and get the root of the tree
    tree = ET.parse(image_xml)
    root = tree.getroot()

    # Location: expected as "<city>;<street>"; skip records without both parts
    coverage = _find_text(root, 'dc:coverage')
    if coverage is None:
        continue
    place = coverage.split(';')
    if len(place) < 2 or place[0] != "'s-Hertogenbosch" or place[1] == '':
        continue
    street = place[1]

    # Medium: make sure the image is a photo
    if _find_text(root, 'dcterms:medium') != 'Foto':
        continue

    # Image url: without one there is nothing to download
    image_url = _find_text(root, 'europeana:object')
    if not image_url:
        continue

    # Check if the image exists. `except Exception` keeps the scrape
    # best-effort for download/decode failures, while still letting
    # KeyboardInterrupt stop the loop (a bare `except:` swallowed it).
    try:
        response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
        image = Image.open(BytesIO(response.content))
        hasj = str(imagehash.average_hash(image))
    except Exception:
        continue

    # Size of the image
    width, height = image.size

    # Title may be absent; store None instead of failing on .strip()
    title = _find_text(root, 'dc:title')

    # Add to list (key order determines the dataframe column order)
    images.append({
        'street': street,
        'available': 0 if hasj == no_img else 1,
        'width': width,
        'height': height,
        'url': image_url,
        'title': title.strip() if title is not None else None,
        'year': _extract_year(_find_text(root, 'dc:date')),
    })

In [None]:
# Build one table row per collected image record and preview the first rows
df = pd.DataFrame(data=images)
df.head(n=5)

In [None]:
# Import dataframe into MySQL
# Connection settings are read from environment variables so that no password
# is stored in the notebook; a missing password is asked for interactively.
import getpass
from urllib.parse import quote_plus

import sqlalchemy

database_username = os.environ.get('SCAVENGER_DB_USER', 'remote')
database_password = os.environ.get('SCAVENGER_DB_PASSWORD') or getpass.getpass('MySQL password: ')
database_ip       = os.environ.get('SCAVENGER_DB_HOST', '165.22.199.122')
database_name     = os.environ.get('SCAVENGER_DB_NAME', 'scavenger')

# Percent-encode user and password so characters such as '@' or '/' cannot
# break the connection URL
database_connection = sqlalchemy.create_engine(
    'mysql+mysqlconnector://{0}:{1}@{2}/{3}'.format(
        quote_plus(database_username), quote_plus(database_password),
        database_ip, database_name))
df.to_sql(con=database_connection, name='images', if_exists='replace')

In [None]:
import requests
import json

# HERE app_id & app_code
# Read from the environment so the keys are not stored in the notebook;
# when a variable is not set, ask for the value interactively instead.
import getpass

APP_ID = os.environ.get('HERE_APP_ID') or getpass.getpass('HERE app_id: ')
APP_CODE = os.environ.get('HERE_APP_CODE') or getpass.getpass('HERE app_code: ')

# Function that collects multiple location-coordinates of a single street
def get_location(streetname):
    '''Returns list of latitude and longitude coordinates for a given street.

    The street is looked up as "<streetname>, 's-Hertogenbosch" and the first
    result of the first view in the response is used.

    Parameters:
        streetname: name of the street to look up.

    Returns:
        A list of ten values, in this order: DisplayPosition latitude and
        longitude, first NavigationPosition latitude and longitude, MapView
        TopLeft latitude and longitude, MapView BottomRight latitude and
        longitude, and the latitude and longitude of the midpoint between
        the two MapView corners.
        None when `streetname` is not a string or is blank, or when the
        response contains no view or no result.

    Raises:
        requests.HTTPError: when the geocoder answers with an error status.
    '''

    # Missing street values (None / NaN) and blank names cannot be geocoded
    if not isinstance(streetname, str) or not streetname.strip():
        return None

    # Let requests build the query string so that spaces, apostrophes or '&'
    # in a street name are encoded instead of corrupting the URL
    response = requests.get(
        'https://geocoder.api.here.com/6.2/geocode.json',
        params={
            'app_id': APP_ID,
            'app_code': APP_CODE,
            'searchtext': streetname + ", 's-Hertogenbosch",
        },
        timeout=30,
    )
    # Surface rejected requests as an HTTP error rather than a KeyError below
    response.raise_for_status()
    r = response.json()

    views = r.get('Response', {}).get('View') or []
    if not views:
        return None

    results = views[0].get('Result') or []
    if not results:
        return None

    loc_data = results[0]['Location']

    # DisplayPosition
    loc1 = loc_data['DisplayPosition']

    # NavigationPosition (a list; the first entry is used)
    loc2 = loc_data['NavigationPosition'][0]

    # MapView corners
    loc3A = loc_data['MapView']['TopLeft']
    loc3B = loc_data['MapView']['BottomRight']

    # Avg coordinates b/w Mapview TopLeft & Mapview BottomRight
    loc4 = ((loc3A['Latitude'] + loc3B['Latitude']) / 2,
            (loc3A['Longitude'] + loc3B['Longitude']) / 2)

    return [loc1['Latitude'], loc1['Longitude'],
            loc2['Latitude'], loc2['Longitude'],
            loc3A['Latitude'], loc3A['Longitude'],
            loc3B['Latitude'], loc3B['Longitude'],
            loc4[0], loc4[1]]

In [None]:
# All streetnames present in the dataframe (missing values dropped up front)
unique_streets = df['street'].dropna().unique()

# Remove unusable names by value rather than by list position: the positions
# of nan, '' and 'onbekend' (Dutch for "unknown") depend on the scraped data,
# and every `del` shifts the positions that a following `del` relies on.
street_names = [
    street for street in unique_streets
    if isinstance(street, str)
    and street.strip()
    and street.strip().lower() != 'onbekend'
]

# Generate dictionary with street location and all corresponding locations
# (the value is None when no coordinates were found for a street)
street_locations = {street: get_location(street) for street in street_names}

In [None]:
# Names of the streets whose lookup produced no coordinates (value is None)
failed_location = [
    name
    for name, coords in street_locations.items()
    if coords is None
]

In [None]:
# Remove streets from street_locations dictionary that do not have any coordinates.
# pop() gets a default so re-running this cell, when the keys are already
# gone, does not raise KeyError.
for street in failed_location:
    street_locations.pop(street, None)

In [None]:
# Names of the coordinate columns, one latitude/longitude pair per location kind.
# Prefixes: dis = display location, nav = navigation location,
# map_top = mapview topleft, map_bot = mapview bottomright, map_avg = mapview average
column_order = [
    prefix + '_' + axis
    for prefix in ('dis', 'nav', 'map_top', 'map_bot', 'map_avg')
    for axis in ('lat', 'long')
]

In [None]:
# One row per street, one column per coordinate; the street names start out
# as the index and are moved into a regular 'street' column.
loc_df = (
    pd.DataFrame.from_dict(street_locations, orient='index', columns=column_order)
    .reset_index(level=0)
    .rename(columns={'index': 'street'})
)
loc_df.head()

In [None]:
# Import dataframe into MySQL
# Connection settings are read from environment variables so that no password
# is stored in the notebook; a missing password is asked for interactively.
import getpass
from urllib.parse import quote_plus

import sqlalchemy

database_username = os.environ.get('SCAVENGER_DB_USER', 'remote')
database_password = os.environ.get('SCAVENGER_DB_PASSWORD') or getpass.getpass('MySQL password: ')
database_ip       = os.environ.get('SCAVENGER_DB_HOST', '165.22.199.122')
database_name     = os.environ.get('SCAVENGER_DB_NAME', 'scavenger')

# Percent-encode user and password so characters such as '@' or '/' cannot
# break the connection URL
database_connection = sqlalchemy.create_engine(
    'mysql+mysqlconnector://{0}:{1}@{2}/{3}'.format(
        quote_plus(database_username), quote_plus(database_password),
        database_ip, database_name))
loc_df.to_sql(con=database_connection, name='locations', if_exists='replace')