In [5]:
!pip install regex



In [6]:
import requests
import requests_cache
import lxml.etree as et
import regex as re
# Route every later `requests` call through a cache named "./here.sqlite",
# so re-running the notebook does not have to download the same pages again.
requests_cache.install_cache("./here.sqlite")

# Strips a single trailing non-digit character (e.g. the dot in n="12.") from
# numeric n attributes. The trailing character must not be a digit or a quote:
# with a bare `.` the pattern backtracked on clean values and turned n="12"
# into n="1".
re_n = re.compile(r'n="(\d+)[^\d"\n]"')
# Greek <foreign> element sitting between a closing </quote> and an opening <p>.
re_greek_1 = re.compile(r"""</quote>(\s+)<foreign xml:lang="grc">([^<]+)</foreign>(\s+)<p>""")
# Greek <foreign> element sitting between a closing </p> and an opening <p>.
re_greek_2 = re.compile(r"""</p>(\s+)<foreign xml:lang="grc">([^<]+)</foreign>(\s+)<p>""")
# Greek <foreign> element directly following a closing </p>.
re_greek_3 = re.compile(r"""</p>(\s+)<foreign xml:lang="grc">([^<]+)</foreign>""")

# Compiled XSLT 1.0 stylesheet that turns the intermediate
# document > (main >) section > p markup into TEI-style elements:
#   - document                      -> <body>
#   - section | main                -> <div type="textpart"> copying @subtype and @n
#   - p                             -> <div type="textpart" subtype="commentary">
#   - text nodes directly under p   -> wrapped in <p>
#   - b, head                       -> their content becomes an n attribute
#   - span with class clsSpanCap    -> <quote>
#   - span with class clsSpanGRC    -> <foreign xml:lang="grc">
# NOTE(review): the n attribute produced for `b`/`head` presumably only attaches
# when it is emitted before any child node of the enclosing div — confirm that
# <b> is always the first node inside each source <p>.
xsl_xml = et.XSLT(et.fromstring("""<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    exclude-result-prefixes="xs"
    version="1.0">
    <xsl:template match="p">
        <div type="textpart" subtype="commentary">
            <xsl:apply-templates/>
        </div>
    </xsl:template>
    <xsl:template match="b">
        <xsl:attribute name="n">
            <xsl:apply-templates/>
        </xsl:attribute>
    </xsl:template>
    <xsl:template match="span[contains(@class,'clsSpanCap')]">
        <quote>
            <xsl:apply-templates/>
        </quote>
    </xsl:template>
    <xsl:template match="span[contains(@class,'clsSpanGRC')]">
        <foreign xml:lang="grc">
            <xsl:apply-templates/>
        </foreign>
    </xsl:template>
    <xsl:template match="p/text()">
        <p><xsl:copy /></p>
    </xsl:template>
    <xsl:template match="section|main">
        <div type="textpart">
            <xsl:attribute name="subtype"><xsl:value-of select="@subtype" /></xsl:attribute>
            <xsl:attribute name="n"><xsl:value-of select="@n" /></xsl:attribute>
            <xsl:apply-templates/>
        </div>
    </xsl:template>
    <xsl:template match="head">
        <xsl:attribute name="n">
            <xsl:apply-templates/>
        </xsl:attribute>
    </xsl:template>
    <xsl:template match="document">
        <body>
            <xsl:apply-templates />
        </body>
    </xsl:template>
</xsl:stylesheet>"""))
# Download the eleven commentary pages (index.xps?10.1 … 10.11) and keep their
# HTML keyed by page number for the conversion cell below.
uris = {}
for element in range(1, 12):
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(f"http://horatius.net/index.xps?10.{element}", timeout=30)
    # Fail loudly on HTTP errors instead of storing an error page as if it
    # were commentary text.
    response.raise_for_status()
    uris[element] = response.text

In [7]:
from bs4 import BeautifulSoup

def romanToInt(s):
    """Convert an upper-case Roman numeral string to its integer value.

    The string is scanned left to right. Whenever the next two characters
    form one of the subtractive pairs (IV, IX, XL, XC, CD, CM) the pair is
    consumed as a unit; otherwise a single symbol is consumed.

    Returns 0 for an empty string and raises ``KeyError`` for a character
    that is not a Roman numeral symbol.
    """
    values = {
        'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000,
        'IV': 4, 'IX': 9, 'XL': 40, 'XC': 90, 'CD': 400, 'CM': 900,
    }
    total = 0
    pos = 0
    length = len(s)
    while pos < length:
        # Prefer a two-character subtractive pair when one starts here.
        takes_pair = pos + 1 < length and s[pos:pos + 2] in values
        if takes_pair:
            total += values[s[pos:pos + 2]]
            pos += 2
            continue
        total += values[s[pos]]
        pos += 1
    return total

def map_to_roman(data):
    """Lazily normalise heading tokens.

    Tokens containing a dot (abbreviations such as ``carm.``) are passed
    through untouched; every other token is upper-cased, read as a Roman
    numeral and yielded as its decimal string. Despite the name, the
    conversion goes *from* Roman numerals *to* decimal numbers.
    """
    yield from (
        token if "." in token else str(romanToInt(token.upper()))
        for token in data
    )

            
from collections import defaultdict

# Maps a work name (e.g. "carmina") to the list of converted XML pages for it.
documents = defaultdict(list)

# Iterate over the eleven downloaded commentary pages
for element in range(1, 12):
    # Parse the page. The parser is named explicitly so the resulting tree does
    # not depend on which parsers happen to be installed (lxml is already
    # imported by this notebook).
    page1 = BeautifulSoup(uris[element], "lxml")

    # Build intermediate XML structured as document > (main > ) section > p.
    # Fragments are collected in a list and joined once at the end.
    xml_parts = [f"""<document id="{element}">"""]
    started_book = False
    mt = None
    for comm in page1.select("body > .clsDivComm"):
        hr = comm.previous_sibling
        clsCap = hr.previous_sibling # a.text = carm. i ii, then p.string = text starting the poem
        head = clsCap.a.text
        # Tokens with a dot are kept; the others are converted from Roman
        # numerals to decimal strings (e.g. "carm. i ii" -> "carm. 1 2").
        head = " ".join(map_to_roman(head.split()))
        if head.startswith("carm. saec. 1"):
            sub = "saec"
            st = "carmen"
            doc = "carmina saecularis"
        elif head.startswith("carm. "):
            main, sub = head.replace("carm. ", "").split()
            mt, st = "book", "carmen"
            doc = "carmina"
        elif head.startswith("serm. "):
            main, sub = head.replace("serm. ", "").split()
            mt, st = "book", "sermo"
            doc = "sermones"
        elif head.startswith("ep. "):
            # Keep the poem number as `n` and use "epode" as the subtype, like
            # every other branch does. The previous code overwrote the number
            # with "epode" and emitted subtype="ep." for every epode.
            _, sub = head.split()
            st = "epode"
            doc = "epodes"
        elif head.startswith("epist. "):
            main, sub = head.replace("epist. ", "").split()
            mt, st = "book", "epistula"
            doc = "epistulae"
        else:
            st, sub = "arsPoetica", "ap"
            doc = "ars.poetica"

        # Open the enclosing book element once per page, when the first poem
        # belonging to a book is seen.
        if not started_book and mt:
            xml_parts.append(f"""<main subtype="{mt}" n="{main}">""")
            started_book = True
        xml_parts.append(f"""<section subtype="{st}" n="{sub}">""")
        xml_parts.extend(str(p) for p in comm.find_all("p"))
        xml_parts.append("</section>")

    if started_book:
        xml_parts.append("</main>")
    xml_parts.append("</document>")
    false_xml = "".join(xml_parts)

    # Move the document to TEI via the XSLT stylesheet, then clean the result
    false_xml = et.tostring(xsl_xml(et.fromstring(false_xml)), encoding=str, pretty_print=True)
    false_xml = false_xml.replace("<p> </p>", "")
    false_xml = false_xml.replace("""<div type="textpart" subtype="commentary"/>""", "")
    # NOTE(review): with encoding=str lxml presumably emits a literal U+00A0
    # rather than the "&#xA0;" reference, so this may never match — confirm.
    false_xml = false_xml.replace("&#xA0;", "")
    # Raw strings below avoid invalid-escape warnings; the values are unchanged.
    false_xml = re_n.sub(r'n="\g<1>"', false_xml)
    # Move Greek <foreign> elements that ended up between blocks into a <p>
    false_xml = re_greek_1.sub(r'</quote>\g<1><p><foreign xml:lang="grc">\g<2></foreign>', false_xml)
    false_xml = re_greek_2.sub(r'<foreign xml:lang="grc">\g<2></foreign>', false_xml)
    false_xml = re_greek_3.sub(r'<p><foreign xml:lang="grc">\g<2></foreign></p>', false_xml)
    # Drop square brackets, mark parenthesised text as <expan>, and turn a
    # single escaped <x> into an <abbr rend="macron"> element
    false_xml = re.sub(r"[\]\[]", "", false_xml)
    false_xml = re.sub(r"\(([^\)]+)\)", r"<expan>\g<1></expan>", false_xml)
    false_xml = re.sub(r"&lt;(\w)&gt;", r'<abbr rend="macron">\g<1></abbr>', false_xml)

    # The page is filed under the work of the last poem seen on it
    documents[doc].append(false_xml)
    

In [8]:
# Write one XML file per work, merging the <body> elements of its pages.
for document, pages in documents.items():
    # Explicit UTF-8: the text contains Greek, which a platform-default
    # encoding may be unable to represent.
    with open(f"{document}.xml", "w", encoding="utf-8") as f:
        # Pretty-printed pages end with a newline, so consecutive bodies are
        # separated by whitespace; `\s*` lets the merge match in that case too
        # (a plain "</body><body>" replace did not).
        f.write(re.sub(r"</body>\s*<body>", "", "".join(pages)))