In [6]:
from lxml import etree
import re
import os
import glob
from dateutil import parser
from datetime import datetime
from zipfile import ZipFile

In [None]:
# Directory that receives the generated *_text.xml / *_akn.xml files.
outdir = "../data/processed/b3514S_akn"
# Source .odt versions of the bill. glob returns files in filesystem order,
# so sort the list to process the versions in the same order on every run.
odt = sorted(glob.glob("../data/raw/B3514S/*.odt"))

In [2]:
def rename_internal_refs(bill_elem):
    '''
    Turn every ``reference-ref`` element under ``body`` into a ``ref`` link.

    The element's ``ref-name`` attribute is removed and looked up against the
    ``wId`` attributes found anywhere below *bill_elem*; the ``eId`` of the
    first match becomes the new ``href`` ("#" + eId). The ``reference-format``
    attribute is dropped.

    Returns *bill_elem*, which is modified in place.

    Raises KeyError if a ``reference-ref`` has no ``ref-name`` attribute and
    IndexError if no element carries the referenced ``wId``.
    '''
    for ref in bill_elem.xpath("./body//reference-ref"):
        ref.tag = "ref"
        target = ref.attrib.pop("ref-name")
        # Pass the name as an XPath variable instead of formatting it into the
        # expression, so a name containing a quote cannot break the query.
        href = bill_elem.xpath(".//*[@wId=$wid]/@eId", wid=target)[0]
        ref.attrib['href'] = "#" + href
        # The format attribute is only discarded, so tolerate its absence.
        ref.attrib.pop("reference-format", None)
    return bill_elem

def nest_eIds(bill_elem):
    '''
    Prefix the eId of each nested provision in ``body`` with its parent's eId.

    Subsections are handled first, then paragraphs, then subparagraphs, so a
    parent already carries its full, prefixed eId when its children are
    processed. Returns *bill_elem*, which is modified in place.
    '''
    for tag in ("subsection", "paragraph", "subparagraph"):
        for node in bill_elem.xpath("./body//" + tag):
            parent_id = node.getparent().attrib['eId']
            node.attrib['eId'] = "-".join((parent_id, node.attrib['eId']))
    return bill_elem


def rename_toc_refs(bill_elem):
    '''
    Re-point the table-of-contents entries at the eIds used in the body.

    An entry whose class ends in "number" holds a bookmark name in its href;
    it is replaced by "#" + the eId of the body element whose ``toc``
    attribute equals that bookmark. Any other entry copies the href of the
    entry immediately before it. Returns *bill_elem*, modified in place.
    '''
    for entry in bill_elem.findall("./coverPage/toc/tocItem"):
        if not entry.attrib['class'].endswith("number"):
            entry.attrib['href'] = entry.getprevious().attrib['href']
            continue

        bookmark = entry.attrib['href'][1:]  # drop the leading "#"
        targets = bill_elem.xpath('./body//*[@toc="{}"]/@eId'.format(bookmark))
        entry.attrib['href'] = "#" + targets[0]

    return bill_elem

In [3]:
def get_attribs(elem):
    '''
    Collect the ``toc``, ``eId`` and ``wId`` attributes for a container.

    ``toc`` comes from the name of the element's ``bookmark-start`` child,
    ``eId`` and ``wId`` from its ``sequence`` child. Raises IndexError when
    any of the three is missing.
    '''
    def first(query):
        return elem.xpath(query)[0]

    return {
        "toc": first("./bookmark-start/@name"),
        "eId": first("./sequence/@eId"),
        "wId": first("./sequence/@wId"),
    }

def part_container(body, text):
    '''
    Start from parts and nest downwards.

    For every ``p`` with style "partnumber" directly under *text*, a ``part``
    element is created in *body* holding:
      * a ``num`` ("PART " + the text of the paragraph's ``sequence``),
      * the paragraph's next sibling, renamed to ``heading``,
      * a ``content`` element that receives the part paragraph itself,
      * every following sibling up to the next part number, schedule number
        or back cover.
    The sections inside the new part are then nested by section_container().

    Returns *body*. Elements are moved out of *text* as a side effect.
    '''
    stop_styles = ["partnumber", "schedulenumber", "backcover"]

    for part in text.xpath("./p[@style-name='partnumber']"):
        attrib = get_attribs(part)
        num = part.find("./sequence")

        container = etree.SubElement(body, "part", attrib)
        part_num = etree.SubElement(container, "num")
        part_num.text = "PART " + num.text
        heading = part.getnext()
        heading.tag = "heading"
        container.append(heading)
        index = container.index(heading)

        # TO DO: strip the literal "PART" text left in the part paragraph.
        # The earlier attempt only rebound a loop variable (which also
        # shadowed the `text` parameter) and never changed the tree, so it
        # has been removed; the output is unchanged.

        content = etree.SubElement(container, "content")
        for p in part.xpath("./following-sibling::*"):
            if p.attrib['style-name'] in stop_styles:
                break
            container.append(p)

        # `content` is still empty here, so this insert appends the paragraph.
        content.insert(index+1, part)
        section_container(container)
    return body
                
def chapter_container(body):
    '''
    Placeholder for chapter nesting.

    Currently this only looks up the chapter-number paragraphs below *body*
    and returns *body* unchanged.
    '''
    # The attribute value was missing its closing quote, which made the
    # expression invalid XPath and caused every call to raise.
    # TO DO: nest the located chapters.
    chapters = body.xpath(".//p[@style-name='chapternumber']")
    return body
    
def section_container(container):
    '''
    Wrap every section paragraph found below *container* in a ``section``.

    A section paragraph is a ``p`` whose ``sequence`` child has an eId
    starting with "section". Its previous sibling (the shoulder note) becomes
    the ``heading``; the new element also gets a ``num`` and a ``content``
    holding the section paragraph. Following siblings are pulled in until an
    element without a style-name or the next shoulder note is reached, and
    subsections and paragraphs are then nested via subsection_container().
    Each finished ``section`` is appended to *container*, which is returned.
    '''
    for marker in container.xpath(".//p[starts-with(./sequence/@eId, 'section')]"):
        # The shoulder note sits immediately before the section paragraph.
        shoulder = marker.getprevious()
        shoulder.tag = "heading"

        wrapper = etree.Element("section", {
            "toc": shoulder.xpath("./bookmark-start/@name")[0],
            "eId": marker.xpath("./sequence/@eId")[0],
            "wId": marker.xpath("./sequence/@wId")[0],
        })
        number = etree.Element("num")
        number.text = marker.find("./sequence").text

        wrapper.append(shoulder)
        wrapper.append(number)
        wrapper_content = etree.SubElement(wrapper, "content")
        slot = wrapper.index(shoulder) + 1

        for sibling in marker.xpath("./following-sibling::*"):
            style = sibling.attrib.get('style-name')
            if style is None or style == "shouldernote":
                break
            wrapper.append(sibling)

        wrapper_content.insert(slot, marker)
        subsection_container(wrapper, "subsection")
        subsection_container(wrapper, "paragraph")
        container.append(wrapper)
    return container

def subsection_container(container, container_type):
    '''
    Wrap numbered paragraphs of *container_type* in elements of that name.

    Every direct ``p`` child of *container* whose ``sequence`` eId starts with
    *container_type* opens a new element carrying that sequence's eId and
    wId, a ``num`` and a ``content``. Following siblings are collected until
    one contains a sequence of the same type. Paragraphs and subparagraphs
    inside the new element are nested recursively; whatever plain ``p``
    children remain are moved into ``content``. The new element takes the
    position the opening paragraph had. Returns *container*.
    '''
    opener_query = "./p[starts-with(./sequence/@eId, '{}')]".format(container_type)
    boundary_query = './/sequence[starts-with(@eId, "{}")]'.format(container_type)

    for opener in container.xpath(opener_query):
        wrapper = etree.Element(container_type)
        position = opener.getparent().index(opener)
        seq = opener.find(".//sequence")

        wrapper.attrib['eId'] = seq.attrib['eId']
        wrapper.attrib['wId'] = seq.attrib['wId']
        etree.SubElement(wrapper, "num").text = seq.text
        wrapper_content = etree.SubElement(wrapper, "content")

        for sibling in opener.xpath("./following-sibling::*"):
            if sibling.xpath(boundary_query):
                break
            wrapper.append(sibling)
        wrapper_content.append(opener)

        # Nest the deeper levels before sweeping up the leftovers.
        subsection_container(wrapper, "paragraph")
        subsection_container(wrapper, "subparagraph")

        for leftover in wrapper.xpath("./p"):
            wrapper_content.append(leftover)
        container.insert(position, wrapper)
    return container

def schedule_container(schedule):
    '''
    Build an ``hcontainer`` named "schedule" from a schedule-number paragraph.

    The container takes its ``toc`` from the paragraph's bookmark and its
    eId/wId from the paragraph's ``sequence``, whose text becomes the ``num``.
    ``content`` starts with the schedule paragraph itself, followed by its
    following siblings up to the back cover or the next schedule number.
    '''
    bookmark = schedule.xpath("./bookmark-start/@name")[0]
    sequence = schedule.find("./sequence")
    schedule_eid = sequence.attrib['eId']
    schedule_wid = sequence.attrib['wId']

    wrapper = etree.Element(
        "hcontainer",
        {"name": "schedule", "toc": bookmark, "wId": schedule_wid, "eId": schedule_eid},
    )
    etree.SubElement(wrapper, "num").text = sequence.text
    wrapper_content = etree.SubElement(wrapper, "content")

    stop_styles = ("backcover", "schedulenumber")
    for sibling in schedule.xpath("./following-sibling::*"):
        if sibling.attrib['style-name'] in stop_styles:
            break
        wrapper_content.append(sibling)

    wrapper_content.insert(0, schedule)
    return wrapper

def backcover_container(backpage):
    '''
    Wrap *backpage* in ``conclusions/container[@name="backcover"]`` and
    return the new ``conclusions`` element.
    '''
    conclusions = etree.Element("conclusions")
    wrapper = etree.SubElement(conclusions, "container", {"name": "backcover"})
    wrapper.append(backpage)
    return conclusions
    

In [4]:
# Akoma Ntoso 3.0 namespace URI (the CSD13 variant, per the URI itself);
# add_ns_prefix() puts un-namespaced elements into this namespace.
AKN_3 = "http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD13"

def strip_ns_prefix(tree):
    '''
    Remove the namespace from every namespaced element in *tree*.

    Each such element is renamed to its local name, and each of its
    attributes is re-keyed under the attribute's local name. Returns *tree*,
    which is modified in place.
    '''
    namespaced = tree.xpath("descendant-or-self::*[namespace-uri()!='']")
    for node in namespaced:
        node.tag = etree.QName(node).localname
        # Work from a snapshot of the keys because the mapping is rewritten.
        for qualified in list(node.attrib):
            value = node.attrib.pop(qualified)
            node.attrib[etree.QName(qualified).localname] = value
    return tree

def add_ns_prefix(tree):
    '''
    Put every element of *tree* that has no namespace into the AKN_3
    namespace. Returns *tree*, which is modified in place.
    '''
    unqualified = tree.xpath("descendant-or-self::*[namespace-uri()='']")
    for node in unqualified:
        # Qualify the existing local name with the Akoma Ntoso namespace.
        node.tag = etree.QName(AKN_3, node.tag)
    return tree

def get_text(root):
    '''
    Return the cleaned ``body/text`` element of an ODT content document.

    Namespaces are stripped from *root* in place, the ``forms`` and
    ``sequence-decls`` children of the text element are removed, user-field
    declarations whose name starts with "-pi-" are dropped, and ``span``,
    ``alphabetical-index-mark`` and ``soft-page-break`` tags are unwrapped
    (their text is kept).
    '''
    root = strip_ns_prefix(root)
    text = root.find("body/text")

    # find() returns None for a missing child and remove(None) raises, so
    # only remove the housekeeping elements that are actually present.
    for path in ("./forms", "./sequence-decls"):
        unwanted = text.find(path)
        if unwanted is not None:
            text.remove(unwanted)

    for decl in text.xpath("./user-field-decls/user-field-decl[starts-with(@name, '-pi-')]"):
        decl.getparent().remove(decl)

    # "hidden-text" is deliberately not stripped here.
    etree.strip_tags(text, "span", "alphabetical-index-mark", "soft-page-break")
    return text

def resequence_numbers(text):
    '''
    Give every ``sequence`` element an eId (and a wId where available) and
    split section numbers into paragraphs of their own.

    The eId is the sequence's ``name`` with "number" removed, an underscore,
    and the sequence text. All other attributes are discarded. The wId is the
    name of the first following ``reference-mark-end`` sibling, if there is
    one. The reference-mark tags are then unwrapped.

    Finally, section paragraphs that also contain subsection numbers are
    reorganised so that every p element has only one sequence child: a new
    ``p`` with style "section" receives the ``eol`` and the section
    ``sequence`` and is inserted directly before the original paragraph.

    Returns *text*, which is modified in place.
    '''
    for num in text.xpath(".//sequence"):
        #num.tag = "num"
        eId = num.attrib['name'].replace("number", "")+"_"+num.text
        # Replace the attributes in one step rather than popping keys from
        # the mapping while iterating over it.
        num.attrib.clear()
        num.attrib["eId"] = eId

        end_marks = num.xpath("./following-sibling::reference-mark-end[1]/@name")
        if end_marks:
            num.attrib['wId'] = end_marks[0]
    etree.strip_tags(text, ["reference-mark-start", "reference-mark-end"])

    for section in text.xpath(".//p[@style-name='subsection'][starts-with(./sequence/@eId, 'section')]"):
        p = etree.Element("p", {"style-name": "section"})
        p.append(section.find("./eol"))
        p.append(section.find("./sequence"))
        text.insert(text.index(section), p)

    return text
    
def back_cover(text):
    '''
    Locate the back cover table and extract the event and its date.

    Backcover is a table, with a style name mapped to automatic-styles. The table style-name can vary
    but the master-page-name mapping is constant. The table found is re-labelled with the
    style-name "backcover".

    *text* must be the element that holds ``automatic-styles`` as a direct child
    (the caller passes the document root).

    Variable event is a narrative summary of the event which gave rise to the preparation of the Bill Expression

    Variable date is the date when an initiating or amending event occurred.

    Returns the tuple (date, event), with date as an ISO formatted string.
    Raises AssertionError unless exactly one date paragraph is found.
    '''
    # Use the argument rather than the module-level `root`, so the function
    # works on whatever document it is handed.
    backcover_name = text.xpath("./automatic-styles/style[@master-page-name='backcover']/@name")[0]

    last_table = text.xpath(".//table[@style-name='{}']".format(backcover_name))[0]
    last_table.attrib['style-name'] = "backcover"
    # Raw string: the backslashes belong to the regular expression.
    date = last_table.xpath(r".//table-cell//p[re:match(., '^\d{1,2}\w{2}\s\w+,\s\d{4}$')][1]",
                  namespaces={"re": "http://exslt.org/regular-expressions"})

    # Raised explicitly so the check is not stripped when running with -O.
    if len(date) != 1:
        raise AssertionError("Date search returned: {}".format(len(date)))
    event = "".join(date[0].getprevious().xpath(".//text()"))
    date = str(parser.parse(date[0].text).date())

    print(event, date)
    #text.remove(last_table)
    return date, event

def get_metadata(text):
    '''
    Footer contains Bill No. and year in the format "No. 35a of 2014", where the No. is the
    sequence number of the Bill in that year and the letter (a,b,c,d,e) corresponding to second and subsequent
    versions of the Bill. If there is no letter, the Bill is as initiated.

    Returns the tuple (bill_uri, bill_version), where bill_uri is
    "/ie/oireachtas/bill/<year>/<no>" and bill_version is "ver_<letter>" or
    "initiated". Raises AssertionError if the footer does not match.
    '''
    footer = text.xpath(".//user-field-decl[@name='Footer']/@string-value")[0]
    #bill_status = text.xpath(".//user-field-decl[@name='EN_DOC_STAGE']/@string-value")[0]
    # Raw string: the backslashes belong to the regular expression.
    bill_id = re.search(r"No\.\s(?P<no>\d+)(?P<version>[a-z])?\sof\s(?P<year>\d{4})", footer)
    # Raised explicitly so the check is not stripped when running with -O.
    if bill_id is None:
        raise AssertionError("No match for Bill ID")

    version_letter = bill_id.group("version")
    bill_version = "ver_"+version_letter if version_letter else "initiated"
    print("Version", bill_version)
    bill_uri = "/ie/oireachtas/bill/{}/{}".format(bill_id.group("year"), bill_id.group("no"))

    return bill_uri, bill_version

def FRBR_elems(text, date, event):
    '''
    Create the ``bill`` element with its ``meta/identification`` block,
    holding the FRBRWork, FRBRExpression and FRBRManifestation entries.

    To do: use event data to populate FRBRExpression/FRBRdate/@name+FRBRauthor/@href+sponsor
    attributes
    '''
    bill_uri, bill_version = get_metadata(text)
    expression_uri = bill_uri+"/eng@"+bill_version

    bill_elem = etree.Element("bill", name="Bill")
    meta = etree.SubElement(bill_elem, "meta")
    identification = etree.SubElement(meta, "identification", {"source": "#BillsOffice"})

    # (FRBR level, ((child tag, attributes), ...)) in document order.
    levels = (
        ("FRBRWork", (
            ("FRBRthis", {"value": bill_uri+"/main"}),
            ("FRBRuri", {"value": bill_uri}),
            ("FRBRdate", {"date": date, "name": "presented"}),
            ("FRBRauthor", {"href": "#sponsor?", "as": "sponsor"}),
            ("FRBRcountry", {"value": "#ie"}),
        )),
        ("FRBRExpression", (
            ("FRBRthis", {"value": expression_uri+"/main"}),
            ("FRBRuri", {"value": expression_uri}),
            ("FRBRdate", {"date": date, "name": "published"}),
            ("FRBRauthor", {"href": "?sponsor?", "as": "sponsor"}),
            # This second author might not be valid
            ("FRBRauthor", {"href": "?committee?", "as": "committee"}),
            ("FRBRlanguage", {"language": "eng"}),
        )),
        ("FRBRManifestation", (
            ("FRBRthis", {"value": expression_uri+"/main.xml"}),
            ("FRBRuri", {"value": expression_uri+".xml"}),
            ("FRBRdate", {"date": str(datetime.today().date()), "name": "transformed"}),
            ("FRBRauthor", {"href": "#BillsOffice", "as": "editor"}),
        )),
    )
    for level_tag, children in levels:
        level = etree.SubElement(identification, level_tag)
        for child_tag, attributes in children:
            etree.SubElement(level, child_tag, attributes)

    return bill_elem

def coverpage_elems(text, bill_elem):
    '''
    Add the ``coverPage`` to *bill_elem*: a table of contents and the list of
    Acts referred to.

    Every top-level ``p`` whose style starts with "toc" and which has a
    ``bookmark-ref`` child becomes a ``tocItem`` (class = the style name
    without its "toc" prefix) with a number and a heading inline.

    Every paragraph of the alphabetical index is moved into the
    "actsReferredTo" container. Where its text ends in "<year> (No. <no>)",
    the text is wrapped in a ``ref`` pointing at "#act.<year>.<no>".

    Returns *bill_elem*. Index paragraphs are moved out of *text*.
    '''
    coverPage = etree.SubElement(bill_elem, "coverPage")
    toc = etree.SubElement(coverPage, "toc", {'class': "body"})

    for tocItem in text.xpath("./p[starts-with(@style-name, 'toc')][./bookmark-ref]"):
        bookmark_ref = tocItem.find("./bookmark-ref")
        attrib = {"href": "#"+bookmark_ref.attrib['ref-name'],
                  "class": tocItem.attrib['style-name'][3:],
                  "level": "1"}
        toc_item = etree.SubElement(toc, "tocItem", attrib)
        toc_no = etree.SubElement(toc_item, "inline", {"name": "tocNum"})
        toc_no.text = tocItem.text
        #toc_item.text = None
        toc_hd = etree.SubElement(toc_item, "inline", {"name": "tocHeading"})
        toc_hd.text = bookmark_ref.text

    act_ref = etree.SubElement(coverPage, "container", {"name": "actsReferredTo"})
    #refs = etree.SubElement(act_ref, "content")

    # To do: Add references to other Acts as Active Modification metadata

    act_text = etree.SubElement(act_ref, "p")
    act_text.text = "Acts Referred To"

    # Compiled once for all entries; raw string because the backslashes
    # belong to the regular expression.
    act_pattern = re.compile(r"(?P<year>\d{4})\s\(No\.\s(?P<no>\d+)\)$")
    for p in text.xpath("./alphabetical-index[@name='Alphabetical Index1']/index-body/p"):
        p.attrib.pop("style-name")
        entry_text = "".join(p.xpath(".//text()"))
        match = act_pattern.search(entry_text)
        if match is not None:
            act_uri = "#act.{}.{}".format(match.group("year"), match.group("no"))

            link = etree.SubElement(p, "ref", {"href": act_uri})
            link.text = entry_text
            p.text = None
        act_ref.append(p)
    return bill_elem
        
def preface_elems(text, bill_elem):
    '''
    Add the ``preface`` to *bill_elem*.

    The Irish and English short titles, the Irish and English document stage
    and the status are read from the user-field declarations in *text*, each
    placed in its own classed ``p``. They are followed by the "entitled"
    paragraph text and the long title paragraph (moved out of *text*, with
    its attributes removed).

    Afterwards every top-level ``p`` of *text* that precedes the "enacted"
    paragraph is removed. Returns *bill_elem*.
    '''
    preface = etree.SubElement(bill_elem, "preface")

    # (class of the wrapping p, child element, user-field name)
    user_fields = (
        ("ga_shorttitle", "shortTitle", "GA_shorttitle"),
        ("en_shorttitle", "shortTitle", "EN_shorttitle"),
        ("ga_stage", "docStage", "GA_DOC_STAGE"),
        ("en_stage", "docStage", "EN_DOC_STAGE"),
        ("status", "docStatus", "STATUS"),
    )
    for css_class, child_tag, field_name in user_fields:
        wrapper = etree.SubElement(preface, "p", {"class": css_class})
        child = etree.SubElement(wrapper, child_tag)
        child.text = text.xpath(
            ".//user-field-decl[@name='{}']/@string-value".format(field_name))[0]

    entitled = etree.SubElement(preface, "p")
    entitled.text = "".join(text.xpath("./p[@style-name='entitled']//text()"))

    long_title = etree.SubElement(preface, "longTitle")
    long_p = text.find("./p[@style-name='longtitle']")
    # Drop all attributes in one step rather than popping keys from the
    # mapping while iterating over it.
    long_p.attrib.clear()
    long_title.append(long_p)

    # findall() returns a list, so removing paragraphs while looping is safe.
    for p in text.findall("./p"):
        if p.attrib['style-name'] == "enacted":
            break
        text.remove(p)
    return bill_elem

def preamble_elems(text, bill_elem):
    '''
    Add a ``preamble`` to *bill_elem* holding the "enacted" paragraph of
    *text*, stripped of its attributes. Returns *bill_elem*.
    '''
    preamble = etree.SubElement(bill_elem, "preamble")
    enacted = text.find("./p[@style-name='enacted']")
    # Drop all attributes in one step rather than popping keys from the
    # mapping while iterating over it.
    enacted.attrib.clear()
    preamble.append(enacted)
    return bill_elem

In [20]:
# Convert every .odt version of the bill into two files in `outdir`:
# <name>_text.xml (what is left of the ODT text) and <name>_akn.xml (the bill).
# Kept as a flat module-level loop: the per-file names (root, text,
# bill_elem, ...) remain globals, as before.

# Compiled once for all links; raw string because the backslashes belong to
# the regular expression.
act_link = re.compile(r"book\.ie/(?P<year>\d{4})/\w+/\w+/\w+/(?P<no>\d+)/index.html")

for fn in odt:
    print(fn)
    obj = os.path.basename(fn).split()[0].split(".")[0]

    # Context managers close both the archive and the member file, even when
    # reading fails; tabs are replaced by spaces before parsing.
    with ZipFile(fn) as z, z.open("content.xml") as f:
        content_xml = f.read().replace(b"<text:tab/>", b" ")
    root = etree.fromstring(content_xml)

    text = get_text(root)

    # Map each automatic paragraph style onto its named parent style.
    para_styles = {p.attrib['name']:p.attrib['parent-style-name']
                   for p in root.xpath("./automatic-styles/style[@family='paragraph']")}

    for p in text.findall(".//p[@style-name]"):
        if p.attrib['style-name'] in para_styles:
            p.attrib['style-name'] = para_styles[p.attrib['style-name']]

    # hidden-text elements become <eol number="..."/>.
    for eol in text.xpath(".//hidden-text"):
        eol.tag = "eol"
        number = eol.attrib['string-value']
        # Replace the attributes in one step rather than popping keys from
        # the mapping while iterating over it.
        eol.attrib.clear()
        eol.attrib['number'] = number
    text = resequence_numbers(text)
    date, event = back_cover(root)
    bill_elem = FRBR_elems(text, date, event)
    bill_elem = coverpage_elems(text, bill_elem)
    bill_elem = preface_elems(text, bill_elem)
    bill_elem = preamble_elems(text, bill_elem)
    #text.remove(text.find("user-field-decls"))
    text.remove(text.find("alphabetical-index"))
    body = etree.SubElement(bill_elem, "body")

    if text.find("./p[@style-name='partnumber']") is not None:
        body = part_container(body, text)
    else:
        # Creating temporary container to pass to section_container
        container = etree.Element("temp_container")
        # find() does not support starts-with(), and the numbers are still
        # <sequence> elements at this point, so use a real XPath query.
        first_section = text.xpath("./p[starts-with(./sequence/@eId, 'section')]")[0]
        # section_container() takes each section's heading from the element
        # before it, so start one element earlier when there is one.
        heading = first_section.getprevious()
        start = first_section if heading is None else heading
        for child in text[text.index(start):]:
            # Schedules and the back cover stay in `text`; they are picked
            # up further down.
            if child.get('style-name') in ("schedulenumber", "backcover"):
                break
            container.append(child)
        container = section_container(container)
        for child in list(container):
            body.append(child)

    schedules = text.xpath("./p[@style-name='schedulenumber']")
    for schedule in schedules:
        body.append(schedule_container(schedule))

    # NOTE(review): Akoma Ntoso appears to expect <conclusions> after <body>
    # rather than inside it -- confirm against the schema before changing.
    backpage = text.xpath("./table[@style-name='backcover']")[0]
    body.append(backcover_container(backpage))

    bill_elem = nest_eIds(bill_elem)
    bill_elem = rename_toc_refs(bill_elem)
    bill_elem = rename_internal_refs(bill_elem)

    # Rewrite links to the statute book as internal "#act.<year>.<no>" refs.
    for a in bill_elem.findall(".//a"):
        #print(a.attrib)
        a.attrib.pop("type")
        act = act_link.search(a.attrib['href'])
        if act is not None:
            a.attrib['href'] = "#act."+act.group("year")+"."+act.group("no").lstrip('0')

    etree.strip_tags(bill_elem, ["alphabetical-index-mark-start",
                                "alphabetical-index-mark-end",
                                "sequence",
                                "bookmark-start",
                                "bookmark-end"])

    # Make sure the output directory exists before writing into it.
    os.makedirs(outdir, exist_ok=True)
    with open(os.path.join(outdir, obj+"_text.xml"), "wb") as out:
        out.write(etree.tostring(text, encoding="utf-8", pretty_print=True, xml_declaration=True))
    with open(os.path.join(outdir, obj+"_akn.xml"), "wb") as out:
        out.write(etree.tostring(bill_elem, encoding="utf-8", pretty_print=True, xml_declaration=True))

../data/raw/B3514S\B3514S.odt
Presented by Senator Maurice Cummins on behalf of the Minister for Transport, Tourism and Sport, 2014-04-17
Version initiated
../data/raw/B3514S\B35a14S.odt
Ordered by Seanad Éireann to be printed, 2014-06-10
Version ver_a
../data/raw/B3514S\B35b14S.odt
Passed by Seanad Éireann, 2014-06-12
Version ver_b
../data/raw/B3514S\b35c14S.odt
As amended in the Select sub-Committee on Transport, Tourism and Sport 2014-07-03
Version ver_c
../data/raw/B3514S\b35d14S.odt
Passed by Dáil Éireann, 2014-07-09
Version ver_d
