In [2]:
from lxml import etree
import re
from dateutil import parser
from datetime import datetime

In [3]:
# Parse the unzipped ODT content.xml and keep a handle on its root element.
content_tree = etree.parse("../data/interim/B3015D_unzipped_odt/content.xml")
root = content_tree.getroot()

In [5]:
# Compile the Akoma Ntoso 3.0 XSD; the schema document is read from the URL below.
akn_xsd_doc = etree.parse("http://docs.oasis-open.org/legaldocml/akn-core/v1.0/csprd01/part2-specs/schemas/akomantoso30.xsd")
schema = etree.XMLSchema(akn_xsd_doc)

In [45]:
# Namespace URIs and the xsi:schemaLocation value used for the output document.
AKN_3 = "http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD13"
XSI = "http://www.w3.org/2001/XMLSchema-instance"
SCHEMA_LOCATION = "http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD13 ./akomantoso30.xsd "

# The AKN namespace is the default (unprefixed) one; "xsi" is declared for schemaLocation.
nsmap = {None: AKN_3, 'xsi': XSI}

# Root <akomaNtoso> element that the converted bill is appended to later on.
AK = etree.Element(
    "{%s}akomaNtoso" % AKN_3,
    attrib={"{%s}schemaLocation" % XSI: SCHEMA_LOCATION},
    nsmap=nsmap,
)

In [46]:
# Input / output locations for this bill.
bill_odt = "../data/interim/B3015D_unzipped_odt/content.xml"
bill_interim = "../data/interim/B3015D_interim.xml"
bill_akn = "../data/processed/B3015D.akn.xml"

# Read the raw bytes so every <text:tab/> element can be turned into a single
# space before the document is parsed.
with open(bill_odt, "rb") as f:
    root = etree.fromstring(f.read().replace(b"<text:tab/>", b" "))

# NOTE(review): strip_ns_prefix is defined outside this section; the plain
# (unprefixed) paths used below presumably rely on it removing the ODF
# namespaces - confirm against its definition.
root = strip_ns_prefix(root)

text = root.find("{*}body/{*}text")
assert text is not None, "No body/text element found in {}".format(bill_odt)

# Drop the form and sequence declarations when present. Element.remove()
# raises TypeError when handed None, so a document without one of these
# children must not reach remove().
for unwanted_name in ("forms", "sequence-decls"):
    unwanted = text.find("./" + unwanted_name)
    if unwanted is not None:
        text.remove(unwanted)

# The back cover is the top-level table whose style is bound to the
# 'backcover' master page in the automatic styles.
backcover_styles = root.xpath("./automatic-styles/style[@master-page-name='backcover']/@name")
assert backcover_styles, "No automatic style bound to the 'backcover' master page"
backcover_name = backcover_styles[0]
backcover_tables = text.xpath("./table[@style-name='{}']".format(backcover_name))
assert backcover_tables, "No table uses the back-cover style {!r}".format(backcover_name)
last_table = backcover_tables[0]


# Remove the user-field declarations whose name starts with '-pi-'.
for decl in text.xpath("./user-field-decls/user-field-decl[starts-with(@name, '-pi-')]"):
    decl.getparent().remove(decl)

# Unwrap inline markup but keep its text.
# (An earlier experiment copied each hidden-text 'string-value' onto the parent
# element as a 'page-line' attribute before stripping:)
#for hidden in text.findall(".//hidden-text"):
#    hidden.getparent().attrib['page-line'] = hidden.attrib['string-value']
etree.strip_tags(text, "hidden-text", "span", "alphabetical-index-mark")

# Find the back-cover paragraph whose whole string value matches
# "<1-2 digits><2 word chars> <word>, <4 digits>". The pattern is a raw string
# so that \d, \w and \s reach the EXSLT regex engine without relying on
# Python's deprecated handling of unknown escape sequences.
date_paragraphs = last_table.xpath(
    r".//table-cell//p[re:match(., '^\d{1,2}\w{2}\s\w+,\s\d{4}$')][1]",
    namespaces={"re": "http://exslt.org/regular-expressions"})


assert len(date_paragraphs) == 1, \
    "Expected exactly one date paragraph on the back cover, found {}".format(len(date_paragraphs))
# Parse the paragraph's full string value (what the regex matched) rather than
# only its leading .text, which is None when the paragraph starts with a child.
date = str(parser.parse(str(date_paragraphs[0].xpath("string()"))).date())
print(date)
text.remove(last_table)

# Document-level values are read from the 'string-value' of user-field declarations.
footer = text.xpath(".//user-field-decl[@name='Footer']")[0].attrib['string-value']
bill_status = text.xpath(".//user-field-decl[@name='EN_DOC_STAGE']")[0].attrib['string-value']

# The footer must contain "No. <number>[<lowercase letter>] of <4-digit year>".
# Raw string: the pattern contains \. \s and \d, which are invalid escape
# sequences in an ordinary Python string literal.
bill_id = re.search(r"No\.\s(?P<no>\d+)(?P<version>[a-z])?\sof\s(?P<year>\d{4})", footer)
assert bill_id is not None, "No match for Bill ID"
bill_uri = "/ie/oireachtas/bill/{}/{}".format(bill_id.group("year"), bill_id.group("no"))


# Root <bill> element and its FRBR identification block (Work / Expression / Manifestation).
bill_elem = etree.Element("bill", name="Bill")
meta = etree.SubElement(bill_elem, "meta")
identification = etree.SubElement(meta, "identification", {"source": "#BillsOffice"})

# --- Work: the bill itself, identified by year and number.
frW = etree.SubElement(identification, "FRBRWork")
etree.SubElement(frW, "FRBRthis", {"value": bill_uri+"/main"})
etree.SubElement(frW, "FRBRuri", {"value": bill_uri})

etree.SubElement(frW, "FRBRdate", {"date": date, "name": "presented"})
etree.SubElement(frW, "FRBRauthor", {"href": "#sponsor?", "as": "sponsor"})
etree.SubElement(frW, "FRBRcountry", {"value": "#ie"})

# --- Expression: the English text, versioned by the last word of the stage string.
frE = etree.SubElement(identification, "FRBRExpression")
expression_uri = bill_uri+"/eng@"+bill_status.split()[-1]
etree.SubElement(frE, "FRBRthis", {"value": expression_uri+"/main"})
etree.SubElement(frE, "FRBRuri", {"value": expression_uri})
etree.SubElement(frE, "FRBRdate", {"date": date, "name": "published"})
# NOTE(review): this href is "?sponsor?" while the Work above uses "#sponsor?";
# both look like placeholders - confirm which form is intended.
etree.SubElement(frE, "FRBRauthor", {"href": "?sponsor?", "as": "sponsor"})
etree.SubElement(frE, "FRBRlanguage", {"language": "eng"})

# --- Manifestation: the XML serialisation of the Expression above.
# Its URIs are derived from the Expression URI. Previously expression_uri was
# rebuilt here from the complete bill_status string, so the Manifestation
# pointed at a different (possibly whitespace-containing) version than the
# Expression it is supposed to serialise.
frM = etree.SubElement(identification, "FRBRManifestation")
etree.SubElement(frM, "FRBRthis", {"value": expression_uri+"/main.xml"})
etree.SubElement(frM, "FRBRuri", {"value": expression_uri+".xml"})

# The manifestation date is the day this conversion is run.
etree.SubElement(frM, "FRBRdate", {"date": str(datetime.today().date()), "name": "transformed"})
etree.SubElement(frM, "FRBRauthor", {"href": "#BillsOffice", "as": "editor"})

# Cover page with a table of contents built from the 'toc*' styled paragraphs.
coverPage = etree.SubElement(bill_elem, "coverPage")
toc = etree.SubElement(coverPage, "toc", {'class': "body"})

# Only top-level paragraphs that carry a bookmark reference become TOC entries.
toc_paragraphs = text.xpath("./p[starts-with(@style-name, 'toc')][./bookmark-ref]")
for tocItem in toc_paragraphs:
    bookmark_ref = tocItem.find("./bookmark-ref")
    attrib = {
        # Points at the bookmark name for now.
        "href": "#" + bookmark_ref.attrib['ref-name'],
        # Style name with its first three characters removed.
        "class": tocItem.attrib['style-name'][3:],
        "level": "1",
    }
    toc_item = etree.SubElement(toc, "tocItem", attrib)

    # Leading text of the paragraph is used as the entry number ...
    toc_no = etree.SubElement(toc_item, "inline", name="tocNum")
    toc_no.text = tocItem.text

    # ... and the text of the bookmark reference as the entry heading.
    toc_hd = etree.SubElement(toc_item, "inline", name="tocHeading")
    toc_hd.text = bookmark_ref.text

# "Acts Referred To" container on the cover page, filled from the alphabetical index.
act_ref = etree.SubElement(coverPage, "container", {"name": "actsReferredTo"})
act_text = etree.SubElement(act_ref, "p")
act_text.text = "Acts Referred To"

# An entry ending in "<4-digit year> (No. <number>)" is turned into a link.
# Raw string: the pattern contains \d \s \( \. \) which are invalid escape
# sequences in an ordinary Python string literal.
act_pattern = re.compile(r"(?P<year>\d{4})\s\(No\.\s(?P<no>\d+)\)$")

for p in text.xpath("./alphabetical-index[@name='Alphabetical Index1']/index-body/p"):
    # Tolerate index paragraphs that carry no style-name attribute.
    p.attrib.pop("style-name", None)
    # p.text is None when the paragraph has no leading text; treat that as "no match".
    act_match = act_pattern.search(p.text or "")
    if act_match is not None:
        act_uri = "#act.{}.{}".format(act_match.group("year"), act_match.group("no"))

        # Move the paragraph's leading text into an <a> child carrying the act reference.
        ref = etree.SubElement(p, "a", {"href": act_uri})
        ref.text = p.text
        p.text = None
    # append() moves the paragraph out of the index and into the container.
    act_ref.append(p)

# Preface: short titles, stages, status, the 'entitled' line and the long title.
preface = etree.SubElement(bill_elem, "preface")

# Short title taken from the GA_shorttitle_caps user field.
ga_p = etree.SubElement(preface, "p", attrib={"class": "ga_shorttitle"})
ga_shorttitle = etree.SubElement(ga_p, "shortTitle")
ga_shorttitle_values = text.xpath(".//user-field-decl[@name='GA_shorttitle_caps']/@string-value")
ga_shorttitle.text = ga_shorttitle_values[0]

# Short title taken from the EN_shorttitle user field.
en_p = etree.SubElement(preface, "p", attrib={"class": "en_shorttitle"})
en_shorttitle = etree.SubElement(en_p, "shortTitle")
en_shorttitle_values = text.xpath(".//user-field-decl[@name='EN_shorttitle']/@string-value")
en_shorttitle.text = en_shorttitle_values[0]

# Document stage taken from the GA_DOC_STAGE user field.
ga_stage = etree.SubElement(preface, "p", attrib={"class": "ga_stage"})
ga_docStage = etree.SubElement(ga_stage, "docStage")
ga_stage_values = text.xpath(".//user-field-decl[@name='GA_DOC_STAGE']/@string-value")
ga_docStage.text = ga_stage_values[0]

# Document stage: the EN_DOC_STAGE value that was read earlier into bill_status.
en_stage = etree.SubElement(preface, "p", attrib={"class": "en_stage"})
en_docStage = etree.SubElement(en_stage, "docStage")
en_docStage.text = bill_status

# Document status taken from the STATUS user field.
status_p = etree.SubElement(preface, "p", attrib={"class": "status"})
docStatus = etree.SubElement(status_p, "docStatus")
status_values = text.xpath(".//user-field-decl[@name='STATUS']/@string-value")
docStatus.text = status_values[0]

# Leading text of the paragraph styled 'entitled'.
entitled = etree.SubElement(preface, "p")
entitled_source = text.find("./p[@style-name='entitled']")
entitled.text = entitled_source.text

# The long-title paragraph is moved into <longTitle> with all of its attributes removed.
long_title = etree.SubElement(preface, "longTitle")
long_p = text.find("p[@style-name='longtitle']")
long_p.attrib.clear()
long_title.append(long_p)

# Everything before the enacting formula has been consumed above: drop the
# remaining top-level paragraphs up to (not including) the 'enacted' one.
# .get() is used so a paragraph without a style-name attribute is removed
# instead of raising KeyError.
for p in text.findall("./p"):
    if p.get("style-name") == "enacted":
        break
    text.remove(p)

# The enacting formula becomes the preamble, stripped of its attributes.
preamble = etree.SubElement(bill_elem, "preamble")
enacted = text.find("./p[@style-name='enacted']")
assert enacted is not None, "No paragraph with style 'enacted' found"
enacted.attrib.clear()
preamble.append(enacted)

# Declarations and the index are no longer needed. Element.remove() raises
# TypeError when handed None, so only remove the children that exist.
for leftover_name in ("user-field-decls", "alphabetical-index"):
    leftover = text.find(leftover_name)
    if leftover is not None:
        text.remove(leftover)

# Empty <body>; it is filled from the remaining children of `text` later on.
body = etree.SubElement(bill_elem, "body")


# Map each automatic paragraph style to its parent style. Styles that declare
# no parent are skipped (they used to raise KeyError), so paragraphs using
# them simply keep their style name.
para_styles = {
    style.attrib['name']: style.attrib['parent-style-name']
    for style in root.xpath("./automatic-styles/style[@family='paragraph'][@parent-style-name]")
}

# Replace automatic style names on top-level paragraphs with the parent style name.
for p in text.findall("./p[@style-name]"):
    if p.attrib['style-name'] in para_styles:
        p.attrib['style-name'] = para_styles[p.attrib['style-name']]

2015-03-25


In [47]:
# Snapshot the partially processed body text to the interim file.
with open(bill_interim, "wb") as interim_file:
    interim_xml = etree.tostring(text, xml_declaration=True, pretty_print=True, encoding="utf-8")
    interim_file.write(interim_xml)

In [48]:
# Every <sequence> element below `text` is renamed to <num>.
for sequence_field in text.findall(".//sequence"):
    sequence_field.tag = "num"

# Build the body hierarchy. Each top-level paragraph that contains a
# bookmark-start opens a new container in <body>; the siblings that follow it,
# up to the next bookmarked paragraph, are moved into that container.
for top_elem in text.xpath("./p[./bookmark-start]"):
    bookmark = top_elem.find("./bookmark-start")

    # For a 'shouldernote' paragraph the <num> is taken from the next sibling,
    # otherwise from the bookmarked paragraph itself.
    if top_elem.get("style-name") == "shouldernote":
        num_host = top_elem.getnext()
        num = num_host.find("./num") if num_host is not None else None
    else:
        num = top_elem.find("./num")
    assert num is not None, \
        "No <num> found for bookmarked paragraph {!r}".format(bookmark.get("name"))

    # The container tag is the <num> name with "number" removed; the eId is
    # that tag plus the number's text.
    tag = num.attrib['name'].replace("number", "")
    tocID = bookmark.attrib.pop("name")
    eId = tag+"-"+num.text

    # Re-point the matching TOC entry from the bookmark name to the new eId.
    toc_entry = toc.find("./tocItem[@href='#{}']".format(tocID))
    assert toc_entry is not None, "No TOC entry references bookmark {!r}".format(tocID)
    toc_entry.attrib['href'] = "#"+eId

    for e in (top_elem, num):
        e.attrib.clear()
    container = etree.SubElement(body, tag, {"eId": eId})
    container.append(num)
    # The heading is all text remaining in the bookmarked paragraph (the <num>
    # has already been moved out above), joined with single spaces.
    heading = etree.SubElement(container, "heading")
    heading.text = " ".join(top_elem.xpath(".//text()"))

    # <content> that unnumbered continuation paragraphs are appended to. It is
    # reset for every container so a paragraph can never land in the <content>
    # of the previous container (or hit an undefined name on the first one).
    content = None
    for p in top_elem.xpath("./following-sibling::*"):
        if p.find("./bookmark-start") is not None:
            # The next bookmarked paragraph starts the next container.
            break

        if p.attrib['style-name'] == container.tag:
            # Paragraph styled like the container itself: its own <content>.
            content = etree.SubElement(container, "content")
            content.append(p)
            p.tag = "p"
            p.attrib.clear()
        elif p.find("./num") is not None:
            # Numbered paragraph: becomes a sub-element named after its style,
            # with an eId derived from the container's eId.
            num = p.find("./num")

            tag = p.attrib.pop('style-name')
            eId = container.attrib["eId"]
            eId = "{}_{}-{}".format(eId, tag, num.text)
            subelement = etree.SubElement(container, tag, {"eId": eId})
            subelement.append(num)
            content = etree.SubElement(subelement, "content")
            content.append(p)

            num.tail = None
            for e in (p, num):
                e.attrib.clear()
        else:
            # Unnumbered paragraph: keep its style as a class ("5f_" removed)
            # and attach it to the most recent <content> of this container.
            p.attrib['class'] = p.attrib.pop("style-name").replace("5f_", "")
            if content is None:
                content = etree.SubElement(container, "content")
            content.append(p)

# A first <content> that is followed by further children is renamed to <intro>.
# c[-1] replaces the deprecated getchildren()[-1].
for c in body.xpath(".//*[./content]"):
    content = c.find("./{*}content")
    if content != c[-1]:
        content.tag = "intro"

# Drop the 'type' attribute from every link in the bill.
for a in bill_elem.findall(".//a[@type]"):
    a.attrib.pop("type")

# Unwrap ODF bookkeeping elements, keeping their text in place.
etree.strip_tags(body, ["bookmark-start", "bookmark-end",
                        "soft-page-break",
                        "alphabetical-index-mark-start",
                        "alphabetical-index-mark-end",
                        "reference-mark-start",
                        "reference-mark-end"])

# Remove a leading ".", " ( " or ".(" from paragraph text. Raw string: the
# pattern contains \. \s and \( which are invalid escape sequences in an
# ordinary Python string literal.
leading_punctuation = re.compile(r"^\.|^\s\(\s|^\.\(")
for p in body.findall(".//p"):
    if p.text is not None:
        p.text = leading_punctuation.sub("", p.text)

In [49]:
# NOTE(review): add_ns_prefix is defined outside this section; presumably it
# puts the bill elements into the AKN namespace - confirm against its definition.
bill_elem = add_ns_prefix(bill_elem)

# Attach the bill to the <akomaNtoso> root and serialise the whole document.
AK.append(bill_elem)
xml = etree.tostring(AK, pretty_print=True, xml_declaration=True, encoding="UTF-8")

# Schema validation is currently switched off:
#try:
#    schema.assertValid(etree.fromstring(xml))
#except etree.DocumentInvalid as e:
#    print(e)

with open(bill_akn, "wb") as akn_file:
    akn_file.write(xml)

In [50]:
# Dump whatever is left in `text` after the conversion to bill-text.xml.
with open("bill-text.xml", "wb") as leftover_file:
    leftover_xml = etree.tostring(text, xml_declaration=True, pretty_print=True, encoding="utf-8")
    leftover_file.write(leftover_xml)