In [1]:
def msa2str(msa):
    """Translate a Menota ``msa`` attribute value into a short part-of-speech tag.

    Args:
        msa: the value of the ``msa`` attribute of a word ``<w>`` element,
            e.g. ``'xNC'`` or ``'xVB fF'``.

    Returns:
        The abbreviation used to annotate the word in the HTML output, or an
        empty string when ``msa`` is not one of the known codes (including
        ``None`` and unhashable values).
    """
    # this map is used to annotate part of speech from the menota msa attribute on a word <w> element
    msamap = {
        'xNC': 'n',
        'xNP': 'N',
        'xAJ': 'aj',
        'xPE': 'pr',
        'xPQ': '?',
        # NOTE(review): the original literal was 'd''' (i.e. 'd' + ''), which
        # evaluates to 'd' -- the same tag as 'xDD'. Possibly "d'" was meant;
        # the runtime value is kept unchanged here.
        'xDP': 'd',
        'xDD': 'd',
        'xPD': 'pd',
        'xVB fF': 'fv',
        'xVB fI': '∞',
        'xVB fP': 'vp',
        'xAV': 'av',
        'xAR': 'ar',
        'xAP': '→',
        'xCC': 'ccj',
        'xCS': 'scj',
        'xIT': 'ix',
        'xIM': '+∞',
        'xRP': '◦',
        'xNX': '-',
        'xPX': '<',
        'xVX fF': 'aux',
    }
    try:
        return msamap[msa]
    except (KeyError, TypeError):
        # KeyError: unknown code; TypeError: unhashable value passed as key.
        return ""



In [2]:
def getHTMLtemplate():
    """Return the HTML page skeleton as a list of five string fragments.

    The caller interleaves its own material between the fragments:
    fragment 0, CSS, fragment 1, page title, fragment 2, header markup,
    fragment 3, body contents, fragment 4.
    """
    # everything up to and including the opening <style> tag
    beforeStyle = """<!DOCTYPE html>
                    <html>
                        <head>
                            <meta charset="utf-8">
                            <style>
                            """

    # closes the style element and opens the title element
    beforeTitle = """                 </style>
                            <title>"""

    # closes the head, opens the body and the header div
    beforeHeader = """</title>
                        </head>
                        <body>
                            <div id="header">"""

    # the annotation toggles, then the opening of the content div
    beforeContent = """<div class="toggles">
                                    <h2>Toggle annotation</h2>
                                    <div class="toggle" data-target="break"><span class="break">ᚦ</span>&nbsp;syllable breaks</div>
                                    <div class="toggle" data-target="weight"><span class="weight">ᚦ</span>&nbsp;syllable weight</div>
                                    <div class="toggle" data-target="parts"><span class="parts">ᚦ</span>&nbsp;part of speech</div>
                                    <div class="toggle" data-target="wclass"><span class="wclass">ᚦ</span>&nbsp;word class</div>
                                    <div class="toggle" data-target="alliteration1"><span class="alliteration1">ᚦ</span>&nbsp;alliterative potential (high)</div>
                                    <div class="toggle" data-target="alliteration2"><span class="alliteration2">ᚦ</span>&nbsp;alliterative potential (all)</div>
                                    <div class="toggle" data-target="punctuation"><span class="punctuation">ᚦ</span>&nbsp;punctuation</div>
                                </div>
                            </div>
                            <div id="content">
                    """

    # closes the content div, loads jQuery and wires up the toggles
    afterContent = """ 
                            </div>
                            <script src="https://code.jquery.com/jquery-3.2.1.min.js"></script>

                            <!-- the following attaches a function to the on-click event of the toggles that adds or
                                 removes a class corresponding to the toggle to the body element. These body 
                                 classes are referenced in the relevant CSS to turn on and off various annotations.
                            -->
                            <script>
                            $(function() {
                                $(".toggle").on("click", function() {
                                $("body").toggleClass($(this).data("target"));
                                return false;
                                });
                            });
                            </script>
                        </body>
                    </html>"""

    return [beforeStyle, beforeTitle, beforeHeader, beforeContent, afterContent]


In [3]:
def getStyle(language="OE"):
    """Return the CSS used by the HTML output.

    Args:
        language: one of "OE", "ON", "OS" or "OHG"; any other value is
            replaced by "OE". The value is only validated here -- the
            returned stylesheet does not depend on it.

    Returns:
        The stylesheet as a single string.
    """
    if language not in ("OE", "ON", "OS", "OHG"):
        language = "OE"

    css = """   
                            span     {display: inline-block; margin: 0}
                            img      {float:right; width:400px; padding-right: 40px}
                            p        {padding-left: 30px}
                            p.info   {padding-left: 10px; font-size:80%; color: CadetBlue; float:right; display:block}
                            .page    {color:LightSkyBlue; padding-right:4px; padding-left: 10px}
                            .line    {color:CadetBlue; width:40px; display: inline-block}
                            .prose   {color:LightSeaGreen}
                            .punc    {color:FireBrick; padding-left:2px; padding-right:2px; display:none}
                            .word    {color:Black}
                            .allit1  {color:FireBrick; font-size:50%; vertical-align: super; display:none}
                            .allit2  {color:FireBrick; font-size:50%; vertical-align: super; display:none}
                            .msa     {color:CadetBlue; font-size:60%; vertical-align: sub; display:none}
                            .syll    {color:DimGrey}
                            .grouper {color:Indigo}
                            .separator {color:LightGrey ;font-size:60%; display:none}

                            /* toggle effects */
                            body.break  .separator       {display:inline-block}
                            body.weight .syll.l          {color:LightBlue}
                            body.weight .syll.h          {color:DimGrey}
                            body.weight .syll.o          {color:SteelBlue}
                            body.parts  .msa             {display:inline-block}
                            body.alliteration1  .allit1  {display:inline-block}
                            body.alliteration2  .allit1  {display:inline-block}
                            body.alliteration2  .allit2  {display:inline-block}
                            body.punctuation   .punc     {display:inline-block}

                            /* header and toggles */
                            div#header  {
                                        position: fixed;
                                        top:0px;
                                        left: 0px;
                                        width: 100%;
                                        background-color: WhiteSmoke;
                                        overflow: hidden;
                                        }
                            div#title   {
                                        color: SteelBlue;
                                        padding-left: 20px;
                                        font-size:40pt;
                                        float:left
                                        }
                            span#subtitle {
                                        padding-left: 10px;
                                        color:lightSkyBlue;
                                        font-size: 24pt;
                            }
                            div#source  {
                                        color:LightSkyBlue;
                                        padding-left: 20px;
                                        padding-top: 10px;
                                        font-size: 16pt;
                                        position: absolute;
                                        
                                        top: 50px;
                                        }
                            div#content { height: 100%; padding-top: 140px; overflow: auto}
                            div.toggles {
                                        float: right;
                                        display: inline-block;
                                        background-color: WhiteSmoke;
                                        padding: 10px 20px;
                                        font-family: "Noto Sans", sans-serif;
                                        font-size: 10pt;
                                        color: DarkSlateGrey;
                                        }
                            div.toggles span { color:LightGrey}
                            div.toggles h2 {
                                            margin: 0;
                                            font-size: 10pt;
                                            color: LightSkyBlue;
                                        }
                            div.toggle {
                                            cursor: pointer;
                                        }
                            div.toggle:hover {
                                            color: Grey;
                                        }
                            body.break div.toggle span.break,
                            body.weight div.toggle > span.weight,
                            body.parts div.toggle > span.parts,
                            body.wclass div.toggle > span.wclass,
                            body.alliteration1 div.toggle > span.alliteration1,
                            body.alliteration2 div.toggle > span.alliteration2,
                            body.punctuation div.toggle > span.punctuation
                            {
                                color: DarkSlateGrey;
                            }
        """
    return css

In [6]:
def _intAttr(node, name):
    """Return attribute *name* of *node* as an int, or 0 when it is missing or not numeric."""
    try:
        return int(node.get(name))
    except (TypeError, ValueError):
        return 0


def toHTML(filepath=''):
    """Render an annotated TEI/XML document as a standalone HTML page.

    The document is walked page break by page break (``pb``), then line
    break by line break (``lb``); every node following a line break is
    handed to the matching ``render*`` function and the resulting HTML is
    assembled with the template from ``getHTMLtemplate`` and the CSS from
    ``getStyle``.

    Args:
        filepath: path of the XML file to convert. When empty (the
            default), a file-open dialog is shown to pick the file.

    Returns:
        0 once the HTML file has been written.

    Raises:
        FileNotFoundError: if no input file was given or selected.
    """
    import html
    from lxml import etree as ET

    #get an initial-state XML document; only ask interactively when no path was passed in
    if not filepath:
        import tkinter as tk
        from tkinter import filedialog

        window = tk.Tk()
        window.withdraw()
        try:
            filepath = filedialog.askopenfilename()
        finally:
            window.destroy()
    if not filepath:
        raise FileNotFoundError("no input XML file was selected")

    tree = ET.parse(filepath)
    root = tree.getroot()

    #get the css
    style = getStyle()

    def headerText(path):
        #text of a teiHeader element, or a visible marker when the element is missing
        text = root.findtext(path)
        if text is None:
            text = "TEIerror"
        return text.strip()

    #produce the title and header
    title = headerText("./teiHeader/fileDesc/titleStmt/title")
    subtitle = headerText("./teiHeader/fileDesc/titleStmt/subtitle")
    source = headerText("./teiHeader/fileDesc/sourceDesc/p")

    #the raw title is kept for the output file name; the escaped one goes into the markup
    htmlTitle = html.escape(title, quote=False)
    header = ('<div id="title">' + htmlTitle
              + '<span id="subtitle">' + html.escape(subtitle, quote=False) + '</span></div>')
    header += '<div id="source">' + html.escape(source, quote=False) + '</div>'

    #one place to store any data that needs to persist over iterations of the loop
    loopdata = {'prefix': False, 'compound': False, 'complex': False, 'syllRemain': 0}

    #compiled once; $n is bound per call, so attribute values never need quoting
    #all the lb's following a pb, up to the next pb
    find_lines = ET.XPath('./following-sibling::lb[preceding-sibling::pb[1][@n=$n]]')
    #all the nodes after a line beginning tag and before the next one
    find_nodes = ET.XPath('./following-sibling::*[preceding-sibling::lb[1][@n=$n]]')

    parts = []  #html fragments, joined into the body contents at the end

    #start at the page level
    for page in root.findall('pb'):
        parts.append(renderPage(page, loopdata))

        for line in find_lines(page, n=page.get('n')):
            parts.append(renderLine(line, loopdata))

            #the line has everything of interest: words (with syllables) and punctuation
            lnum = line.get('n')
            if lnum is None:
                lnum = "0"

            #the main loop through the document
            for node in find_nodes(line, n=lnum):
                if node.tag == 'p':
                    parts.append(renderProse(node, loopdata))
                elif node.tag == 'pc':   #punctuation
                    parts.append(renderPunctuation(node, loopdata))
                elif node.tag == 'cb':   #clause beginning
                    parts.append(renderClause(node, loopdata))
                elif node.tag == 'w':    #a word is where all the fun starts
                    loopdata['prefix'] = _intAttr(node, 'p')
                    loopdata['compound'] = _intAttr(node, 'c')
                    parts.append(renderWord(node, loopdata))
                    #the number of syllables in the word yet to be rendered
                    loopdata['syllRemain'] = _intAttr(node, "Σ")
                elif node.tag == 's':    #syllable
                    parts.append(renderSyllable(node, loopdata))

    contents = "".join(parts)

    #the output html file is named after the document title
    filepath = '/Users/peter/Desktop/Signum/Thesis/Misc/' + title + '.html'

    #the template is a list of HTML fragments to be assembled here with the contents
    template = getHTMLtemplate()
    HTML = (template[0] + style + template[1] + htmlTitle + template[2]
            + header + template[3] + contents + template[4])

    #the page declares charset utf-8, so write it as utf-8; 'with' guarantees the file is closed
    with open(filepath, "w", encoding="utf-8") as fout:
        fout.write(HTML)
    return 0

#the following functions return html for each kind of object in the document
def renderPage(page, loopdata):
    """Return the HTML for a page break: its number followed by the manuscript image.

    The page number is read from the 'n' attribute of ``page``; the image
    is expected at ``<resources>/<number>.png``. ``loopdata`` is not used.
    """
    number = page.get('n')
    #location of the image of the manuscript page, without its extension
    imageBase = 'file:///Users/peter/Desktop/Signum/Thesis/Misc/resources/' + number
    label = '<p class="page">' + number + '</p>'
    image = '<img src="' + imageBase + '.png"/>'
    return label + image

def renderLine(line, loopdata):
    """Return the HTML that starts a new line: a paragraph break plus the line number.

    The 'n' attribute of ``line`` is split on ';' and the second field is
    shown as the line number. "0" is shown when the attribute is missing
    or has no second field. ``loopdata`` is not used.
    """
    try:
        lnum = line.get('n').split(';')[1]
    except (AttributeError, IndexError):
        # AttributeError: no line / no 'n' attribute; IndexError: no ';' in the value
        lnum = "0"
    return "<p/>" + '<span class="line">' + lnum + '</span>'

def renderProse(prose, loopdata):
    """Return the HTML span for a prose element.

    The element's text is HTML-escaped ('&', '<', '>') so it cannot break
    the markup; an element without text yields an empty span instead of
    raising. ``loopdata`` is not used.
    """
    import html

    text = html.escape(prose.text or "", quote=False)
    return '<span class="prose">' + text + '</span> '

def renderPunctuation(punc, loopdata):
    """Return the HTML span for a punctuation element.

    The element's text is HTML-escaped ('&', '<', '>') so it cannot break
    the markup; an element without text yields an empty span instead of
    raising. ``loopdata`` is not used.
    """
    import html

    text = html.escape(punc.text or "", quote=False)
    return '<span class="punc">' + text + '</span> '

def renderClause(clause, loopdata):
    """Return the fixed HTML marker for a clause beginning; neither argument is used."""
    return '<span class="clause">§</span>'

def renderWord(word, loopdata):
    """Record per-word state in ``loopdata``; the word itself produces no HTML.

    Sets the following keys of ``loopdata``:
        "wc": the word's 'wc' (word class) attribute, or "" when absent.
        "msa": the part-of-speech span built from the 'msa' attribute, or
            "" when absent. It is stored rather than returned because it
            has to be emitted after the last syllable of the word.
        "firstSyll": True, marking that the next syllable starts this word.

    Returns:
        Always the empty string.
    """
    import html

    loopdata["wc"] = word.get("wc", "")

    #wait to put this after the last syllable of the word
    loopdata["msa"] = ""
    msa = word.get("msa")
    if msa is not None:
        #some tags (e.g. '<') are HTML metacharacters, so escape before embedding
        loopdata["msa"] = '<span class="msa">' + html.escape(msa2str(msa), quote=False) + '</span>'

    loopdata["firstSyll"] = True
    return ""

def renderSyllable(syll, loopdata):
    """Return the HTML for one syllable and update the per-word state in ``loopdata``.

    On the first syllable of a word, an 'A' attribute is rendered as an
    alliteration marker: class "allit1" when the word class is "s",
    otherwise "allit2". The syllable text is wrapped in a span whose class
    carries the 'wt' (weight) attribute. After the text comes either a
    separator (more syllables remain) or, on the last syllable, the stored
    part-of-speech span followed by '-' for a prefix/compound member or a
    space otherwise.

    Keys missing from ``loopdata`` (a syllable seen before any word) fall
    back to neutral defaults instead of raising KeyError. All text taken
    from the document is HTML-escaped.

    Side effects on ``loopdata``: 'syllRemain' is decremented when the
    syllable has text; 'firstSyll' is set to False.
    """
    import html

    pieces = []

    #annotate the alliterative potential on the first syllable only
    if loopdata.get("firstSyll", False) and "A" in syll.attrib:
        allitClass = "allit1" if loopdata.get("wc", "") == "s" else "allit2"
        allit = html.escape(syll.get("A"), quote=False)
        pieces.append('<span class="' + allitClass + '">' + allit + '</span>')

    if syll.text is not None:
        #the syllable weight comes from the 'wt' attribute if present
        weight = html.escape(syll.get("wt", ""), quote=True)
        pieces.append('<span class="syll ' + weight + '">'
                      + html.escape(syll.text, quote=False) + '</span>')

        loopdata['syllRemain'] = loopdata.get('syllRemain', 0) - 1
        if loopdata['syllRemain'] > 0:
            pieces.append('<span class="separator">·</span>')
        else:
            #last syllable of the word
            pieces.append(loopdata.get("msa", ""))
            if loopdata.get('prefix') == 1 or loopdata.get('compound') == 1:
                pieces.append('-')
            else:
                pieces.append(' ')

    loopdata["firstSyll"] = False
    return "".join(pieces)

# Run the conversion as soon as this cell/script executes. No path is passed, so the
# input file is chosen through the file dialog; toHTML() returns 0, which is printed.
print(toHTML())

0
