# Turn one of the ConDÉ books into a PDF file

This script first converts the source document to HTML, so that its contents can be edited, and then turns the resulting HTML into a PDF using the [WeasyPrint](https://weasyprint.org/) package.

[WeasyPrint Python manual](https://doc.courtbouillon.org/weasyprint/stable/first_steps.html#python-library)

### Imports and file variables

In [1]:
import lxml.etree as ET
import lxml.html as HT
from weasyprint import HTML, CSS
import re


# Associate the "tei" and "xml" prefixes with their namespace URIs.
ET.register_namespace("tei","http://www.tei-c.org/ns/1.0")
ET.register_namespace("xml","http://www.w3.org/XML/1998/namespace")

# Matches a facsimile identifier such as "facs_12".
# Raw string: "\d" in a plain string literal is an invalid escape sequence.
facsRE = re.compile(r"facs_\d+")

# Base name of the book; the input file is "../base-version/<filename>_base.xml".
filename = "terrien"

### Define CSS

In [2]:
# I will write CSS at the end.
# css = ""

### FUNCTION: extract text from tei:w element

In [30]:
def get_w_text(word):

    """
    Function taking a <tei:w> element and
    returning its compiled textual content.

    The text directly inside <w> comes first; each child then
    contributes according to its tag:

    - <height>, <supplied>, <c>, <hi>: its own text, then its tail;
    - <lb>: its tail only;
    - <choice>: the text of its second child (<reg> or <expan>),
      then its tail;
    - <add>: its own text, then its <lb> and <choice> children handled
      as above, then its tail;
    - any other child is skipped, together with its tail.

    Missing text (None) counts as an empty string, so an empty child
    never puts the literal "None" in the result.

    :param word: ET.Element('{http://www.tei-c.org/ns/1.0}w')
    :return: the compiled text of the word (str)
    :raises IndexError: if a <tei:choice> has fewer than two children

    """

    ns = '{http://www.tei-c.org/ns/1.0}'
    lb = ns + 'lb'
    choice = ns + 'choice'
    add = ns + 'add'
    # Children whose own text is kept as is.
    own_text_tags = (ns + 'height', ns + 'supplied', ns + 'c', ns + 'hi')

    # Pieces of text, joined once at the end.
    parts = [word.text or ""]

    # Loop on all current <w> children.
    for item in word:
        tag = item.tag

        if tag in own_text_tags:
            parts.append(item.text or "")

        elif tag == lb:
            # A line break has no text of its own; only its tail counts.
            pass

        elif tag == choice:
            # The second child holds the regularised / expanded form.
            parts.append(item[1].text or "")

        elif tag == add:
            parts.append(item.text or "")
            for subitem in item:
                if subitem.tag == lb:
                    parts.append(subitem.tail or "")
                elif subitem.tag == choice:
                    parts.append(subitem[1].text or "")
                    parts.append(subitem.tail or "")
                # NOTE(review): other children of <add> (e.g. <hi>, <c>)
                # are ignored here -- confirm this is intended.

        else:
            # NOTE(review): unknown children are skipped with their tail,
            # which may lose text -- confirm this is intended.
            continue

        # Text following the current child still belongs to the word.
        parts.append(item.tail or "")

    return "".join(parts)

### FUNCTION: Make title string

In [31]:
def title_str(div):

    """
    Function taking a <tei:div> element with lemmatized text
    and returning its title, if any.

    The title is chosen as follows:

    1. if the first child of the div is a <tei:head>, the text of its
       <tei:w> tokens, joined with spaces except around the punctuation
       listed in noLspace / noRspace;
    2. otherwise the capitalized @subtype between square brackets;
    3. otherwise the capitalized @type between square brackets, unless
       it is 'part', 'chapter' or 'section';
    4. otherwise "[Aucun titre]".

    If anything goes wrong a message is printed and the marker
    "Pas réussi." is returned, so the error can be spotted in the output.

    :param div: ET.Element('{http://www.tei-c.org/ns/1.0}div')
    :return: the title string

    """

    # Lists of characters to be treated particularly.
    noLspace = ",.)/]-'"
    noRspace = "(/[]-'"

    # List of strings to be filled.
    divlist = []

    # Last <tei:w> handled; stays None if the error happens before the loop.
    word = None

    try:
        first = div.find('./*[1]')
        divtype = div.get('type')

        # If you do find a title as first child of div, make its text.
        if first is not None and first.tag == "{http://www.tei-c.org/ns/1.0}head":

            # Loop on each <tei:w> word token.
            for word in div.findall('./{http://www.tei-c.org/ns/1.0}head/{http://www.tei-c.org/ns/1.0}w'):

                # Compile the text of current <tei:w> element.
                wtxt = get_w_text(word)

                # First token: it simply opens the list.
                if not divlist:
                    divlist.append(wtxt)

                # Glue the token to the previous entry when no space is
                # wanted: the token takes no space on its left, or the
                # previous entry (or its last character) takes no space
                # on its right. The order of the tests matters: an empty
                # previous entry is caught before indexing its last char.
                elif (wtxt in noLspace
                      or divlist[-1] in noRspace
                      or divlist[-1][-1] in noRspace):
                    divlist[-1] += wtxt

                # Otherwise, just add the token as a new list entry.
                else:
                    divlist.append(wtxt)

            # Once you have treated every token in the title, make the
            # return string by adding a space between each list entry.
            title = " ".join(divlist)

        # If there is no title to the div but it has an @subtype,
        # its value makes the return string.
        elif div.get('subtype') is not None:
            title = "[" + div.get('subtype').capitalize() + "]"

        # Same with @type, when present and not a plain structural level.
        elif divtype is not None and divtype not in ['part','chapter','section']:
            title = "[" + divtype.capitalize() + "]"

        else:
            title = "[Aucun titre]"

    # Just a marker to spot errors within final output.
    except Exception as e:
        # Build the message defensively so the handler itself cannot fail.
        where = div.get('{http://www.w3.org/XML/1998/namespace}id') or "[div without xml:id]"
        what = ET.tostring(word).decode('utf-8') if word is not None else "[no word processed]"
        print(e, "-> Could not construct string for: " + what + " in " + where)
        title = "Pas réussi."

    return title

### Parse the entry TEI-XML file

If the document is linked to an IIIF resource, the script retrieves the title page image(s) under the variable `img_address`. The image is then used as a cover.

In [32]:
# Namespace prefixes in Clark notation, used to build the queries below.
TEI_NS = "{http://www.tei-c.org/ns/1.0}"
XML_NS = "{http://www.w3.org/XML/1998/namespace}"


def _append_text(parent, text):
    """Add text at the end of parent's content, after its last child if any."""
    if not text:
        return
    if len(parent):
        last = parent[-1]
        last.tail = (last.tail or "") + text
    else:
        parent.text = (parent.text or "") + text


doc = ET.Element("html")

# Here add the md as <head>

body = ET.SubElement(doc, "body")

# Binary mode: the XML parser decodes the file from its own encoding
# declaration instead of relying on the platform's default text encoding.
with open(f"../base-version/{filename}_base.xml", "rb") as readxml:
    tree = ET.parse(readxml)
    root = tree.getroot()

    sourceDesc = root.find(f"./{TEI_NS}teiHeader/{TEI_NS}fileDesc/{TEI_NS}sourceDesc/{TEI_NS}bibl[@{XML_NS}id='temoin']")
    iiif = False

    # Get the title pages if available.
    titlepages = root.findall(f'.//{TEI_NS}titlePage')
    if titlepages and "iiif" in (sourceDesc.get("source") or ""):
        iiif = True
        covers = ET.SubElement(body, "div", attrib = {"id":"covers"})
        # The list of surfaces does not change between title pages.
        surfaces = root.findall(f'.//{TEI_NS}surface')
        for titlep in titlepages:
            facs = titlep.find(".//*[@facs]").get("facs").replace("#","")
            zoneid = re.search(facsRE, facs).group()
            # Use the first surface whose xml:id matches, then stop looking.
            for surface in surfaces:
                if surface.get(f"{XML_NS}id") == zoneid:
                    covers.append(ET.Element("img", attrib={"src":surface.get("source"), "alt":"Couverture de la source papier."}))
                    break

    # Build the readable metadata.

    humanMD = ET.SubElement(body, "div", attrib = {"id":"md"})
    humanMDtitle = ET.SubElement(humanMD, "h1")
    humanMDtitle.text = "À propos"

    fileDesc = root.find(f'./{TEI_NS}teiHeader/{TEI_NS}fileDesc')

    # ESSENTIALS = title page

    essentials = ET.SubElement(humanMD,"div", attrib={"id":"identification"})

    baseTitle = ET.SubElement(essentials,"h2")
    baseTitle.text = fileDesc.find(f"./{TEI_NS}titleStmt/{TEI_NS}title").text

    author = ET.SubElement(essentials,"h4")
    author.text = sourceDesc.find(f"./{TEI_NS}author/{TEI_NS}persName").text + " - "

    dates = ET.SubElement(essentials, "h4")
    dates.text = sourceDesc.find(f'./{TEI_NS}pubPlace').text + ", " + sourceDesc.find(f"./{TEI_NS}date").get("when")

    description = ET.SubElement(essentials, "p")
    description.text = "Transcription numérique d'après le document au format XML-TEI :"

    edition = ET.SubElement(description, "cite")
    edition.text = fileDesc.find(f'./{TEI_NS}editionStmt/{TEI_NS}edition').text

    scq = ET.SubElement(essentials, "p")
    scq.text = "Responsables du document XML-TEI, dans le cadre du projet RIN CONDÉ, financé par la Région Normandie (2019-2021) :"
    scqlist = ET.SubElement(essentials,"ul")

    # One list item per responsibility statement: name, period, role, organisation.
    for actor in fileDesc.findall(f"./{TEI_NS}editionStmt/{TEI_NS}respStmt"):
        pers = ET.Element("b")
        pers.text = actor.find(f'./{TEI_NS}persName').text
        org = actor.find(f'./{TEI_NS}orgName').text + actor.find(f'./{TEI_NS}orgName/{TEI_NS}orgName').text
        resp = actor.find(f'./{TEI_NS}resp[1]')
        if resp.get('from') is not None:
            rfrom = resp.get('from')
            rto = resp.get('to')
            from_to = f" (entre {rfrom} et {rto})"
        else:
            from_to = ""

        pers.tail = f"{from_to}, {resp.text.lower()}. {org}"
        total = ET.SubElement(scqlist, "li")
        total.append(pers)

    net = ET.SubElement(essentials, "p")
    net.text = "Le fichier d'origine est disponible :"
    netlist = ET.SubElement(essentials, "ul")
    github = ET.SubElement(netlist, "li")
    github.text = f"Sur GitHub, à l'adresse : www.github.com/RIN-ConDE/editions/base-version/{filename}_base.xml ."
    transfo = ET.SubElement(essentials, "p")
    transfo.text = "La transformation permettant de générer le présent fichier PDF est disponible et documentée sur GitHub à l'adresse : www.github.com/RIN-ConDE/editions/tools/base-to-pdf.ipynb"

    # PUBINFO = details on publication

    pubinfo = ET.SubElement(humanMD,"div", attrib={"id":"publication"})

    pititle = ET.SubElement(pubinfo,"h2")
    pititle.text = "Sur la constitution et l'utilisation du présent document"

    droits = ET.SubElement(pubinfo, "div", attrib={"id":"uses"})
    # An empty tag name is rejected by lxml; these sub-section titles sit
    # one level below the <h2> above, hence <h3>.
    droitstitle = ET.SubElement(droits, "h3")
    droitstitle.text = "Conditions d'utilisation"

    availability = ET.SubElement(droits, "p")
    avtext = fileDesc.find(f'./{TEI_NS}publicationStmt/{TEI_NS}availability/{TEI_NS}licence/{TEI_NS}p[@{XML_NS}lang="fr"]').text
    avlink = fileDesc.find(f'./{TEI_NS}publicationStmt/{TEI_NS}availability/{TEI_NS}licence')
    availability.text = avtext + " - " + avlink.get("target")

    temoin = ET.SubElement(pubinfo,"div", attrib={"id":"temoin"})
    temointitle = ET.SubElement(temoin, "h3")
    temointitle.text = "La source matérielle"

    instit = sourceDesc.find(f'./{TEI_NS}objectIdentifier/{TEI_NS}institution')
    instit_name = instit.find(f'./{TEI_NS}orgName').text
    instit_town = instit.find(f'./{TEI_NS}location/{TEI_NS}settlement').text
    instit_country = instit.find(f'./{TEI_NS}location/{TEI_NS}country').text
    shelfmark = sourceDesc.find(f'./{TEI_NS}objectIdentifier/{TEI_NS}idno').text
    ou = ET.SubElement(temoin, "p")
    ou.text = f"Le témoin source est conservé à la {instit_name}, {instit_town}, {instit_country}, sous la cote : {shelfmark}. Extrait de la notice :"
    notice = ET.SubElement(temoin, "blockquote")

    for p in sourceDesc.findall(f'.//{TEI_NS}note/{TEI_NS}quote/*'):
        parag = ET.SubElement(notice,"p")
        parag.text = p.text

    if iiif:
        # From here on `iiif` names the <p> element, no longer the flag.
        iiif = ET.SubElement(temoin, "p")
        # split() without argument ignores repeated whitespace, so no
        # empty link is produced.
        sourceatts = sourceDesc.get("source").split()
        sourcedict = {'manifeste':[], 'catalogue':[], 'gallica':[], 'iiif':[], 'others':[]}
        for sourceval in sourceatts:
            if "iiif" in sourceval and "manifest" in sourceval:
                sourcedict['manifeste'].append(sourceval)
            elif "iiif" in sourceval:
                sourcedict['iiif'].append(sourceval)
            elif 'catalogue' in sourceval:
                sourcedict['catalogue'].append(sourceval)
            elif 'gallica' in sourceval:
                sourcedict['gallica'].append(sourceval)
            else:
                sourcedict['others'].append(sourceval)

        iiif.text = "Les images sont disponibles selon le protocole IIIF. Voir les liens suivants :"

        # NOTE(review): this <ul> is nested inside a <p> -- confirm the
        # HTML renderer accepts that nesting.
        iiifL = ET.SubElement(iiif, 'ul')

        # Link categories in display order, with the label of each link.
        linklabels = [
            ('manifeste', "Manifeste IIIF"),
            ('iiif', "Lien IIIF"),
            ('gallica', "Facsimile numérique"),
            ('catalogue', "Notice du catalogue"),
            ('others', "Autre source utile"),
        ]
        for category, label in linklabels:
            for sourcelink in sourcedict[category]:
                iiiflink = ET.SubElement(iiifL, 'li')
                manifeste = ET.SubElement(iiiflink, 'a')
                manifeste.set('href', sourcelink)
                manifeste.text = label
                manifeste.tail = " : " + sourcelink

    process = ET.SubElement(pubinfo, "div", attrib={"id":"process"})
    processtitle = ET.SubElement(process, "h3")
    processtitle.text = "Processus d'encodage"

    # Copy each French paragraph of the encoding description, turning
    # <tei:ref> into <a> and flattening any other child to plain text.
    for processparag in root.findall(f'./{TEI_NS}teiHeader/{TEI_NS}encodingDesc/*[@{XML_NS}lang="fr"]/{TEI_NS}p'):
        thisp = ET.SubElement(process, "p")
        thisp.text = processparag.text
        for pchild in processparag.findall('./*'):
            if pchild.tag == f"{TEI_NS}ref":
                link = pchild.get('target')
                if link is None:
                    link = pchild.get('corresp')
                ahref = ET.SubElement(thisp, "a")
                # Without @target or @corresp the <a> simply gets no href.
                if link is not None:
                    ahref.set("href", link)
                ahref.text = pchild.text
                if pchild.tail:
                    ahref.tail = pchild.tail
            else:
                _append_text(thisp, " " + (pchild.text or "") + " ")
                _append_text(thisp, pchild.tail)

    # TEXT CONTENTS

    textcontent = ET.SubElement(body, "div", attrib = {"id":"textcontent"})
    texttitle = ET.SubElement(textcontent, "h1")
    # .text expects a string: use the text of the title, not the element.
    maintitle = fileDesc.find(f'./{TEI_NS}sourceDesc/{TEI_NS}bibl/{TEI_NS}title[@type="main"]')
    if maintitle is not None:
        texttitle.text = maintitle.text

    # FRONT MATTER

    frontmatter = ET.SubElement(textcontent, "div", attrib = {"id":"front-matter"})
    frontmattertitle = ET.SubElement(frontmatter, "h2")
    frontmattertitle.text = "Matière liminaire"
    front = root.find(f"./{TEI_NS}text/{TEI_NS}front")

    # MAIN BODY

    mainbody = ET.SubElement(textcontent, "div", attrib = {"id":"main-body"})
    mainbodytitle = ET.SubElement(mainbody, "h2")
    mainbodytitle.text = "Corps du texte"
    corps = root.find(f"./{TEI_NS}text/{TEI_NS}body")


    # BACK MATTER

    backmatter = ET.SubElement(textcontent, "div", attrib = {"id":"back-matter"})
    backmattertitle = ET.SubElement(backmatter, "h2")
    backmattertitle.text = "Annexes"
    annexes = root.find(f"./{TEI_NS}text/{TEI_NS}back")
    
    