This notebook is responsible for extracting the plain-text of the TLG corpus and filtering texts according to various little filters, **mostly to exclude poetry, fragments and uncertain texts**.

In [2]:
# Documenting
from typing import Generator

# OS
import glob
import os.path

# Data
import csv
import lxml.etree as ET
import pandas
from collections import Counter

# Operations
import regex as re
import unicodedata

# UI
import tqdm

## Constants

In [1]:
#PATH = "/home/thibault/Downloads/comp/Diogenes-Resources/xml/tlg/tlg*.xml"
PATH = "patres/tlg*.xml"
MIN_SIZE = 1000 # Number of first words to take
ForbiddenTGs = {
    "tlg0527", "tlg0031"
}

## XML parsing

In [3]:
def normalize(text: str) -> str:
    return unicodedata.normalize("NFKC", re.sub(r"\s{2,}|\n+", " ", text))

cnt = re.compile("\w+")

NS = {"namespaces": {"t": "http://www.tei-c.org/ns/1.0"}}

XSL = """<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:t="http://www.tei-c.org/ns/1.0"
    exclude-result-prefixes="xs"
    version="1.0">
    <xsl:output method="text"/>
    <xsl:template match="t:TEI">
        <xsl:apply-templates select="//t:body" />
    </xsl:template>
    <xsl:template match="t:body">
        <xsl:apply-templates/>
    </xsl:template>
    <xsl:template match="t:label"/>
    <xsl:template match="t:note"/>
    <xsl:template match="t:head"/>
</xsl:stylesheet>"""

XSL = ET.XSLT(ET.fromstring(XSL))

## Filtering functions

In [4]:
def forbidden_author(string) -> bool:
    string = string.lower()
    if "epistula" in string:
        return True
    if "comment" in string:
        return True
    if "scholia" in string:
        return True
    if "lexic" in string:
        return True
    if "acta" in string:
        return True
    if "fragm" in string:
        return True
    if "corpus" in string:
        return True
    if "anonym" in string:
        return True
    if "pseud" in string:
        return True
    if "testam" in string:
        return True
    if "vita" in string:
        return True
    if string.endswith(" et"):
        return True
    if "evangeli" in string:
        return True
    if "floril" in string:
        return True
    if "anthologia" in string:
        return True
    if "paraphras" in string:
        return True
    if "apocal" in string:
        return True
    if "certamen" in string:
        return True
    if "comica adespota" in string:
        return True
    if "commentaria" in string:
        return True
    if "concilia" in string:
        return True
    if "epica adespota" in string:
        return True
    if "etymologicum" in string:
        return True
    if "epistula ecclesiarum" in string:
        return True
    if "gnologium" in string:
        return True
    if "lyrica adespota" in string:
        return True
    if "proverbi" in string:
        return True
    if "oracula" in string:
        return True
    if "periplus" in string:
        return True
    if "incertus" in string:
        return True
    if "socraticorum epistulae" in string:
        return True
    if "socrat" in string:
        return True
    if "[" in string:
        return True
    if "anonym" in string:
        return True
    if "pseudo" in string:
        return True
    if "ps." in string:
        return True
    if "historia" in string:
        return True
    
def is_poetry(xml):
    return len(xml.xpath("//t:l", **NS)) > 0

def has_fragment(xml):
    return len(xml.xpath("//t:div[@type='fragment']", **NS)) > 0

## Data wrangling functions

In [5]:
def get_tg(filename):
    return os.path.basename(filename)[:7]

def rename_author(author):
    return re.sub(r"(\s+\w+\.)", "", author).replace(" et", "")

## Data accumulation

In [6]:
# Output Data
data = []

# Count the number of files we have per textgroup
tgs = Counter([
    get_tg(file)
    for file in glob.glob(PATH)
])

# Statistics
passed = 0
tgcount = Counter()
ignored_authors = set()
    
for file in tqdm.tqdm(glob.glob(PATH)):
    try:
        xml = ET.parse(file)
        tg = get_tg(file)
        author = str(xml.xpath("/t:TEI/t:teiHeader/t:fileDesc/t:titleStmt/t:author/text()", **NS)[0])
        title = xml.xpath("/t:TEI/t:teiHeader/t:fileDesc/t:titleStmt/t:title/text()", **NS)[0]
        
        if tg in ForbiddenTGs:
            passed += 1
            continue
        elif forbidden_author(author):
            ignored_authors.add(author)
            continue
        elif is_poetry(xml):
            #print(f"Ignoring {file} for poetry reason")
            ignored_authors.add(author)
            continue
        elif has_fragment(xml):
            #print(f"Ignoring {file} for fragment reason")
            ignored_authors.add(author)
            continue
            
        rawtext = normalize(str(XSL(xml))).strip()
        tokens = cnt.findall(rawtext)
        
        if len(tokens) < MIN_SIZE:
            passed += 1
            continue

        data.append({
            "file": os.path.basename(file)[:-4],
            "orig_author": author,
            "author": rename_author(str(author)),
            "title": str(title),
            "textgroup": tg,
            "tokens": len(tokens),
            "full-text-raw": rawtext
        })
        tgcount[tg] += 1
    except Exception as E:
        raise E
        passed += 1
        print(f"Failing on {file}")
        continue
print(passed)
print(ignored_authors)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1654/1654 [00:14<00:00, 117.63it/s]

459
{'Amphilochius Scr. Eccl.', 'Philostorgius Scr. Eccl.', 'Joannes Chrysostomus Scr. Eccl. John Chrysostom', 'Maximus Confessor Theol.', 'Photius Lexicogr., Scr. Eccl. et Theol.', 'Evagrius Scr. Eccl.', 'Arethas Philol. et Scr. Eccl.', 'Theolytus Epic.', 'Theodorus Studites Scr. Eccl. et Theol.'}





## Exporting

In [7]:
df = pandas.DataFrame(data)
print("Before filtering on Title", df.shape)
df = df[~df.title.str.contains("Dub\.|Sp\.|Fragm|Excerpt|(e cod\.)|Suda|recensio|fragm|sp\.|dub\.|(fort\. auctore)|Scholia")]
print("After filtering on Title", df.shape)


df.to_csv("tlg-texts.csv", index=False)
df.head()

Before filtering on Title (1164, 7)
After filtering on Title (700, 7)


  df = df[~df.title.str.contains("Dub\.|Sp\.|Fragm|Excerpt|(e cod\.)|Suda|recensio|fragm|sp\.|dub\.|(fort\. auctore)|Scholia")]


Unnamed: 0,file,orig_author,author,title,textgroup,tokens,full-text-raw
1,tlg2800008,Basilius Scr. Eccl.,Basilius,Homilia in passionem domini,tlg2800,1314,"Μέγα μὲν οὐρανὸς δημιούργημα, καὶ τῆς ὁρωμένης..."
2,tlg2040052,Basilius Theol.,Basilius,De baptismo libri duo,tlg2040,20790,"Ὁ Κύριος ἡμῶν Ἰησοῦς Χριστὸς, ὁ μονογενὴς Υἱὸς..."
3,tlg2017048,Gregorius Nyssenus Theol.,Gregorius Nyssenus,In Basilium fratrem,tlg2017,6134,Καλὴν ἐπέθηκεν ὁ Θεὸς τὴν τάξιν ταῖς ἐτησίοις ...
4,tlg2022060,Gregorius Nazianzenus Theol.,Gregorius Nazianzenus,Carmina moralia,tlg2022,37978,"Παρθενίην στεφάνοις ἀναδήσομεν ἡμετέροισιν, Ἐκ..."
5,tlg4138090,Ephraem Syrus Theol.,Ephraem Syrus,Sermo de uirginitate,tlg4138,1690,Περὶ παρθενίας καὶ ἁγιασμοῦ ψυχῆς Παῦλος ἀπόστ...


In [8]:
for x in sorted(df.orig_author.unique()):
    print(x)

Adamantius Theol.
Alexander Scr. Eccl.
Alexander Theol.
Amphilochius Scr. Eccl.
Antonius Hagiographus Scr. Eccl.
Asterius Scr. Eccl.
Asterius Sophista Scr. Eccl.
Athanasius Theol.
Barlaam Math., Theol. et Epist.
Basilius Med. et Scr. Eccl.
Basilius Scr. Eccl.
Basilius Theol.
Clemens Alexandrinus Theol.
Clemens Romanus Theol. et Clementina
Cyrillus Scr. Eccl.
Cyrillus Theol.
Didymus Caecus Scr. Eccl. Didymus the Blind
Dio Chrysostomus Soph.
Ephraem Scr. Eccl.
Ephraem Syrus Theol.
Epiphanius Scr. Eccl.
Eusebius Scr. Eccl. et Theol.
Eustathius Philol. et Scr. Eccl.
Eustathius Scr. Eccl. et Theol.
Evagrius Scr. Eccl.
Flavius Justinianus Imperator Theol.
Gregorius Nazianzenus Theol.
Gregorius Nyssenus Theol.
Gregorius Thaumaturgus Scr. Eccl.
Hesychius Scr. Eccl.
Hippolytus Scr. Eccl.
Irenaeus Theol.
Joannes Chrysostomus Scr. Eccl. John Chrysostom
Joannes Damascenus Scr. Eccl. et Theol. John of Damascus
Joannes Gramm. et Theol.
Joannes Theol.
Julianus Scr. Eccl.
Leontius Scr. Eccl.
Marcellus

## Lemmatization

Requires to have set-up `bert-env` virtual environment.

In [10]:
!./bert-env/bin/python tag-in-xml.py tlg-texts.csv

  texts = texts[~texts.title.str.contains("Dub\.|Sp\.|Fragm|Excerpt|(e cod\.)|Suda|recensio|fragm|sp\.|dub\.|(fort\. auctore)|Scholia")]
2023-07-25 14:44:49,741 loading file final-model.pt
2023-07-25 14:44:53,285 SequenceTagger predicts: Dictionary with 1030 tags: <unk>, O, a-p---na-, v2spma---, u--------, d--------, v-papamn-, r--------, l-s---ma-, n-s---ma-, v3siie---, l-s---nn-, l-s---fg-, n-s---fg-, l-s---mg-, n-s---mg-, v3ppia---, i--------, n-s---mn-, v3saia---, p-p---fd-, v-sppamn-, a-s---mn-, n-p---mg-, c--------, v3saoa---, p-s---mn-, l-s---mn-, v3siia---, v-sapamg-, b--------, p-s---cg-, p-s---fd-, l-p---mg-, a-p---mg-, a-s---ma-, v-sppamg-, v3spia---, a-p---ng-, n-p---ng-, _, v3piie---, l-p---md-, a-p---md-, v-pppamn-, p-p---ma-, l-s---fa-, n-s---fa-, n-p---na-, v3paia---
0it [00:00, ?it/s]Passing tlg2062098
Passing tlg2063005
Passing tlg4138086
Passing tlg2042022
Passing tlg2797035
Passing tlg2041004
Passing tlg4138077
Passing tlg4138056
Passing tlg4138094
Passing tlg202205

Passing tlg2017026
Passing tlg2062004
Passing tlg2035059
Passing tlg2035041
Passing tlg4138026
Passing tlg2062144
Passing tlg2022027
Passing tlg4089020
Passing tlg2115033
Passing tlg2062025
Passing tlg4110020
Passing tlg2062008
Passing tlg2062086
Passing tlg2021033
Passing tlg2035011
Passing tlg4089005
Passing tlg2062069
Passing tlg2062030
Passing tlg4138134
Passing tlg4138008
Passing tlg2017041
Passing tlg2892112
Passing tlg2017024
Passing tlg2062007
Passing tlg2022031
Passing tlg2062378
Passing tlg0555006
Passing tlg2063001
Passing tlg2017009
Passing tlg2734020
Passing tlg2734005
Passing tlg1271001
Passing tlg2115036
Passing tlg4138073
Passing tlg2115003
Passing tlg2018017
Passing tlg4138031
Passing tlg2022048
Passing tlg2934006
Passing tlg2112011
Passing tlg4138005
Passing tlg2022059
Passing tlg2892049
Passing tlg4138100
Passing tlg2062060
Passing tlg2018028
Passing tlg4089021
Passing tlg2102005
Passing tlg2022062
Passing tlg4138133
