Scrape an ONS dataset web page for the dataset's metadata and the location of its download.

Adds a couple of functions:

1. `scrape(pageURL)` fetches metadata from an ONS dataset web page
2. `writeMetadata(metadata, label, family=None)` generates RDF metadata in TriG format, using the given label, and writes it to `out/dataset.trig`.

In [26]:
from databaker.framework import *
import pandas as pd
import requests
from pathlib import Path
from io import BytesIO
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified
from lxml import html
from urllib.parse import urlparse, urljoin
import re
import rdflib

# Shared HTTP session used by `scrape`: a plain requests session wrapped by
# CacheControl, with a file-based cache in the '.cache' directory and the
# LastModified heuristic.
session = CacheControl(
    requests.Session(),
    cache=FileCache('.cache'),
    heuristic=LastModified(),
)

def _xpath_string(tree, query, index=0):
    """Return the stripped `index`-th result of `query` on `tree`, or None if absent."""
    matches = tree.xpath(query)
    if len(matches) <= index:
        return None
    return matches[index].strip()


def _parse_page_date(text):
    """Convert date text scraped from the page to a `datetime.date`, or None.

    None is returned when the text is missing/empty or cannot be parsed as a
    date (for example a placeholder such as "To be announced").
    """
    if not text:
        return None
    try:
        timestamp = pd.to_datetime(text)
    except (ValueError, TypeError):
        return None
    if pd.isna(timestamp):
        return None
    return timestamp.tz_localize('Europe/London').date()


def scrape(pageURL):
    """Fetch an ONS dataset page and extract its metadata.

    Args:
        pageURL: URL of the ONS dataset web page.

    Returns:
        A dict that always contains:
            'title'   -- text of the first <h1>
            'fileURL' -- absolute URL of the "Download as xls" link
        and, when they can be found on the page and parsed:
            'releaseDate' -- `datetime.date` from the "Release date: " field
            'nextRelease' -- `datetime.date` from the "Next release: " field
            'mailto'      -- href of the link following "Contact: "
            'about'       -- first paragraph after "About this dataset"
        Optional keys are omitted (not set to None) when unavailable, which
        matches the `if key in metadata` checks made by `writeMetadata`.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        IndexError: if the page has no title or no xls download link.
    """
    page = session.get(pageURL)
    # Fail early on 4xx/5xx instead of trying to scrape an error page.
    page.raise_for_status()
    tree = html.fromstring(page.text)

    title = _xpath_string(tree, "//h1/text()")
    if title is None:
        raise IndexError(f'No <h1> title found on {pageURL}')

    download_href = _xpath_string(
        tree, "//a[starts-with(@title, 'Download as xls')]/@href")
    if download_href is None:
        raise IndexError(f"No 'Download as xls' link found on {pageURL}")

    # The date value is the second text node of the element containing the
    # label <span>, hence index 1.
    release_text = _xpath_string(
        tree, "//span[text() = 'Release date: ']/parent::node()/text()", 1)
    next_text = _xpath_string(
        tree, "//span[text() = 'Next release: ']/parent::node()/text()", 1)

    metadata = {
        'title': title,
        'releaseDate': _parse_page_date(release_text),
        'nextRelease': _parse_page_date(next_text),
        'mailto': _xpath_string(
            tree, "//span[text() = 'Contact: ']/following-sibling::a[1]/@href"),
        'fileURL': urljoin(pageURL, download_href),
        'about': _xpath_string(
            tree, "//h2[text() = 'About this dataset']/following-sibling::p/text()"),
    }
    # Drop fields that could not be scraped rather than returning None values.
    return {key: value for key, value in metadata.items() if value is not None}

def pathify(label):
    """Turn a human-readable label into a lower-case, hyphenated path segment.

    Every character that is not a word character or '/' is replaced by a
    hyphen, runs of hyphens are collapsed to one, and a trailing hyphen is
    removed.

    Args:
        label: the text to convert.

    Returns:
        The path-safe form of `label`, e.g. 'Foo Bar!' -> 'foo-bar'.
    """
    slug = re.sub(r'[^\w/]', '-', label.lower())
    slug = re.sub(r'-+', '-', slug)
    # Raw string so `$` is the end-of-string anchor; the previous non-raw
    # '-\$' matched a literal "-$" and never stripped the trailing hyphen.
    return re.sub(r'-$', '', slug)

def writeMetadata(metadata, label, family=None):
    """Write RDF metadata for a dataset to 'out/dataset.trig' in TriG format.

    The dataset URI and its metadata graph URI are derived from
    `pathify(label)` under the 'http://gss-data.org.uk' base.

    Args:
        metadata: dict of scraped metadata. The keys 'about', 'title',
            'mailto', 'releaseDate' and 'nextRelease' are each optional; a
            triple is emitted only for the keys that are present.
        label: human-readable dataset label used to build the dataset path.
        family: optional dataset family name; when truthy, a `gdp:family`
            triple pointing at `GDP[pathify(family)]` is added.

    Side effects:
        Creates the 'out' directory if needed and (over)writes
        'out/dataset.trig'.
    """
    from rdflib import URIRef, RDF, Namespace, Literal, RDFS, Dataset
    from rdflib.namespace import DCTERMS, VOID

    destinationFolder = Path('out')
    destinationFolder.mkdir(exist_ok=True, parents=True)

    # Ask for the current time directly in the London zone. Localizing a naive
    # "now" would mislabel the instant whenever the naive clock is not London
    # wall time, and can raise on ambiguous/nonexistent DST wall times.
    modified_date = pd.Timestamp.now(tz='Europe/London')
    datasetPath = pathify(label)

    base = 'http://gss-data.org.uk'

    PMD = Namespace('http://publishmydata.com/def/dataset#')
    QB = Namespace('http://purl.org/linked-data/cube#')
    GDP = Namespace(f'{base}/def/gdp#')
    ONS = URIRef('https://www.gov.uk/government/organisations/office-for-national-statistics')
    OGL_3 = URIRef('http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/')

    quads = Dataset()
    quads.bind('pmd', PMD)
    quads.bind('qb', QB)
    quads.bind('dct', DCTERMS)
    quads.bind('void', VOID)
    quads.bind('gdp', GDP)

    # All triples go into a named metadata graph for this dataset.
    md = quads.graph(URIRef(f'{base}/graph/{datasetPath}/metadata'))
    ds = URIRef(f'{base}/data/{datasetPath}')

    md.add((ds, RDF.type, PMD.LinkedDataset))
    md.add((ds, RDF.type, PMD.Dataset))
    md.add((ds, RDF.type, QB.DataSet))
    if 'about' in metadata:
        md.add((ds, RDFS.comment, Literal(metadata['about'], 'en')))
    if 'title' in metadata:
        md.add((ds, RDFS.label, Literal(metadata['title'], 'en')))
        md.add((ds, DCTERMS.title, Literal(metadata['title'], 'en')))
    if 'mailto' in metadata:
        md.add((ds, PMD.contactEmail, URIRef(metadata['mailto'])))
    if 'releaseDate' in metadata:
        md.add((ds, DCTERMS.issued, Literal(metadata['releaseDate'])))
    if 'nextRelease' in metadata:
        md.add((ds, PMD.nextUpdateDue, Literal(metadata['nextRelease'])))
    md.add((ds, DCTERMS.modified, Literal(modified_date)))
    md.add((ds, DCTERMS.creator, ONS))
    md.add((ds, DCTERMS.license, OGL_3))
    md.add((ds, DCTERMS.publisher, ONS))
    md.add((ds, VOID.sparqlEndpoint, URIRef(f'{base}/sparql')))
    md.add((ds, PMD.graph, URIRef(f'{base}/graph/{datasetPath}')))
    if family:
        md.add((ds, GDP.family, GDP[pathify(family)]))

    with open(destinationFolder / 'dataset.trig', 'wb') as f:
        quads.serialize(destination=f, format='trig')