In [None]:
from databaker.framework import *
import pandas as pd
import requests
from pathlib import Path
from io import BytesIO
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified
from lxml import html
from urllib.parse import urlparse, urljoin
import re
import rdflib
import html2text
import re

# Shared HTTP session used by scrape(): a requests.Session wrapped in
# CacheControl so that responses are cached on disk under the '.cache'
# directory, with cachecontrol's LastModified heuristic supplied to the cache.
session = CacheControl(requests.Session(),
                       cache=FileCache('.cache'),
                       heuristic=LastModified())

def toMarkdown(node):
    """Return the Markdown rendering of an lxml HTML element.

    The element is first serialised to a unicode HTML string, which is then
    converted to Markdown by html2text.
    """
    markup = html.tostring(node, encoding='unicode')
    return html2text.html2text(markup)

def scrape(pageURL):
    """Fetch a publication landing page and extract its metadata.

    The page is fetched through the module-level cached ``session`` and
    parsed with lxml; values are located with the XPath selectors below.

    Args:
        pageURL: URL of the landing page. Relative links found on the page
            are resolved against it.

    Returns:
        A dict with these keys:
          * ``title``: stripped text of the first ``<h1>``.
          * ``published`` / ``lastUpdated``: ``datetime.date`` values, present
            only when the corresponding "Published " / "Last updated " text
            node exists and contains a "<day> <Month> <year>" date.
          * ``files``: list of ``{'url', 'title', 'type'}`` dicts, one per
            attachment section.
          * ``nextRelease``: ``datetime.date``, present only when a
            "Next release of these statistics:" paragraph exists.
          * ``details``: Markdown of the ``<div>`` following the "Details"
            heading.
          * ``sourceOrg``: absolute URL of the publisher link, when present.

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx
            status.
        IndexError: if the page has no ``<h1>``, no "Details" section, or an
            attachment section lacks its link, title or type.
    """
    date_re = re.compile('[0-9]{1,2} (January|February|March|April|May|June|July|August|September|October|November|December) [0-9]{4}')

    def _to_london_date(text):
        # Parse free-text dates and reduce them to a plain date; the
        # Europe/London localisation mirrors how every date here is handled.
        return pd.to_datetime(text).tz_localize('Europe/London').date()

    page = session.get(pageURL)
    # Fail fast on error responses rather than scraping an error page as if
    # it were the publication.
    page.raise_for_status()
    tree = html.fromstring(page.text)

    md = {}
    md['title'] = tree.xpath("//h1/text()")[0].strip()

    dates = tree.xpath("//div[contains(concat(' ', @class, ' '), 'app-c-published-dates')]/text()")
    # The first text node is expected to hold the publication date and the
    # second the last-updated date; each is only used if its prefix matches.
    for position, prefix, key in ((0, 'Published ', 'published'),
                                  (1, 'Last updated ', 'lastUpdated')):
        if len(dates) > position and dates[position].strip().startswith(prefix):
            match = date_re.search(dates[position])
            if match:
                md[key] = _to_london_date(match.group(0))

    md['files'] = [
        {
            'url': urljoin(pageURL, attachment_section.xpath("div/h2[@class='title']/a/@href")[0].strip()),
            'title': attachment_section.xpath("div/h2[@class='title']/a/text()")[0].strip(),
            'type': attachment_section.xpath("div/p[@class='metadata']/span[@class='type']/descendant-or-self::*/text()")[0].strip()
        } for attachment_section in tree.xpath("//section[contains(concat(' ', @class, ' '), 'attachment')]")]

    nextReleaseNodes = tree.xpath("//p[starts-with(text(), 'Next release of these statistics:')]/text()")
    if nextReleaseNodes:
        # Keep what sits between the first ':' and the first following '.',
        # i.e. the date part of "Next release of these statistics: <date>."
        md['nextRelease'] = _to_london_date(
            nextReleaseNodes[0].strip().split(':')[1].split('.')[0].strip()
        )

    md['details'] = toMarkdown(tree.xpath("//h2[text() = 'Details']/following-sibling::div")[0])

    from_link = tree.xpath("//span[contains(concat(' ', @class, ' '), 'app-c-publisher-metadata__definition_sentence')]/a/@href")
    if from_link:
        md['sourceOrg'] = urljoin(pageURL, from_link[0])
    return md

def pathify(label):
    """Turn a human-readable label into a URL path segment.

    The label is lower-cased, every character that is neither a word
    character nor ``/`` is replaced by ``-``, runs of hyphens are collapsed
    to one, and a trailing hyphen is removed.

    Args:
        label: The text to convert.

    Returns:
        The slug, e.g. ``pathify("Hello World!") == "hello-world"``.
    """
    slug = re.sub(r'[^\w/]', '-', label.lower())
    slug = re.sub(r'-+', '-', slug)
    # Raw pattern so that `$` is the end-of-string anchor; an escaped `\$`
    # would look for a literal dollar sign and never strip the trailing hyphen.
    return re.sub(r'-$', '', slug)

def writeMetadata(metadata, label, comment, family):
    """Write dataset metadata as RDF to ``out/dataset.trig``.

    Builds a named graph ``<base>/graph/<path>/metadata`` describing the
    dataset ``<base>/data/<path>``, where ``<path>`` is ``pathify(label)``,
    and serialises it in TriG format. The ``out`` directory is created if
    needed and an existing ``dataset.trig`` is overwritten.

    Args:
        metadata: Dict of scraped values. The keys ``details``, ``title``,
            ``mailto``, ``published``, ``nextRelease`` and ``sourceOrg`` are
            each optional and only emitted when present.
        label: Dataset label; used to derive the dataset and graph URIs.
        comment: Text stored as the dataset's ``rdfs:comment`` (language
            ``en``).
        family: Dataset family name; when truthy it is pathified and linked
            through ``gdp:family``.

    Returns:
        None.
    """
    from rdflib import URIRef, RDF, Namespace, Literal, RDFS, Dataset
    from rdflib.namespace import DCTERMS, VOID

    destinationFolder = Path('out')
    destinationFolder.mkdir(exist_ok=True, parents=True)

    # Ask for the current time directly in Europe/London. Localising a naive
    # "now" would mislabel the instant whenever that naive value is not
    # London wall time, and can fail on ambiguous/non-existent DST times.
    modified_date = pd.Timestamp.now(tz='Europe/London')
    datasetPath = pathify(label)

    base = 'http://gss-data.org.uk'

    PMD = Namespace('http://publishmydata.com/def/dataset#')
    QB = Namespace('http://purl.org/linked-data/cube#')
    GDP = Namespace(f'{base}/def/gdp#')
    OGL_3 = URIRef('http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/')

    quads = Dataset()
    quads.bind('pmd', PMD)
    quads.bind('qb', QB)
    quads.bind('dct', DCTERMS)
    quads.bind('void', VOID)
    quads.bind('gdp', GDP)

    # All triples go into the dataset's dedicated metadata graph.
    md = quads.graph(URIRef(f'{base}/graph/{datasetPath}/metadata'))
    ds = URIRef(f'{base}/data/{datasetPath}')

    md.add((ds, RDF.type, PMD.LinkedDataset))
    md.add((ds, RDF.type, PMD.Dataset))
    md.add((ds, RDF.type, QB.DataSet))
    md.add((ds, RDFS.comment, Literal(comment, 'en')))
    if 'details' in metadata:
        md.add((ds, DCTERMS.description, Literal(metadata['details'], 'en')))
    if 'title' in metadata:
        # The title is published both as rdfs:label and dct:title.
        md.add((ds, RDFS.label, Literal(metadata['title'], 'en')))
        md.add((ds, DCTERMS.title, Literal(metadata['title'], 'en')))
    if 'mailto' in metadata:
        md.add((ds, PMD.contactEmail, URIRef(metadata['mailto'])))
    if 'published' in metadata:
        md.add((ds, DCTERMS.issued, Literal(metadata['published'])))
    if 'nextRelease' in metadata:
        md.add((ds, PMD.nextUpdateDue, Literal(metadata['nextRelease'])))
    md.add((ds, DCTERMS.modified, Literal(modified_date)))
    if 'sourceOrg' in metadata:
        # The source organisation is recorded as both creator and publisher.
        md.add((ds, DCTERMS.creator, URIRef(metadata['sourceOrg'])))
        md.add((ds, DCTERMS.publisher, URIRef(metadata['sourceOrg'])))
    md.add((ds, DCTERMS.license, OGL_3))
    md.add((ds, VOID.sparqlEndpoint, URIRef(f'{base}/sparql')))
    md.add((ds, PMD.graph, URIRef(f'{base}/graph/{datasetPath}')))
    if family:
        md.add((ds, GDP.family, GDP[pathify(family)]))

    with open(destinationFolder / 'dataset.trig', 'wb') as f:
        quads.serialize(destination=f, format='trig')