In [1]:
import os, glob, re, json
from lxml import etree
import pandas as pd

In [2]:
def clean(text):
    """Normalise whitespace and apostrophes in a piece of extracted text.

    Line breaks and tabs are deleted, every remaining run of two or more
    whitespace characters is collapsed into a single space, and the
    apostrophe U+2019 is replaced by U+02BC.
    """
    # Drop line breaks and tabs in a single pass
    without_breaks = re.sub(r"[\n\t]", "", text)
    # Squeeze whatever whitespace runs are left into one blank
    single_spaced = re.sub(r"\s{2,}", " ", without_breaks)
    # U+2019 -> U+02BC for better compatibility with grc
    return single_spaced.replace("’", "ʼ")

## Plaintext

In [183]:
def _remove_keeping_tail(element):
    """Detach ``element`` from its parent without losing the text that follows it.

    lxml stores the text after an element in ``element.tail`` and drops it
    together with the element on ``remove()``, so the tail is moved to the
    preceding sibling (or to the parent's text) first.
    """
    parent = element.getparent()
    if parent is None:
        return
    if element.tail:
        previous = element.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or "") + element.tail
        else:
            parent.text = (parent.text or "") + element.tail
    parent.remove(element)


def extract_plaintext_from_tei(file_path, exclude_tags):
    """
    Extracts plaintext from a TEI XML file, specifically from the <text> element,
    excluding specified elements and their text.

    Every <div type="praefatio"> and every element named in exclude_tags is
    removed together with its content; text that directly follows a removed
    element is kept.

    Args:
        file_path (str): The path to the TEI XML file (anything accepted by
            lxml's etree.parse works, e.g. an open file object).
        exclude_tags (list): A list of tag names to exclude (e.g., ['note', 'stage']).

    Returns:
        str: The cleaned plaintext of the <text> element, or None if the file
        has no <text> element or could not be processed (a message is printed
        in both cases).
    """
    try:
        # Parse the TEI XML file
        tree = etree.parse(file_path)

        # Define the TEI namespace
        ns = {'tei': 'http://www.tei-c.org/ns/1.0'}  # Adjust the namespace URI if necessary

        # Find the <text> element in the TEI namespace
        text_element = tree.find(".//tei:text", namespaces=ns)

        if text_element is None:
            print("Element <text> not found in the XML.")
            return None

        # Remove every praefatio div as a whole (not only its child elements)
        for praefatio in text_element.findall(".//tei:div[@type='praefatio']", namespaces=ns):
            _remove_keeping_tail(praefatio)

        # Remove specified elements from the <text> element
        for tag in exclude_tags or []:
            # findall returns a list, so removing while looping is safe
            for element in text_element.findall(f'.//tei:{tag}', namespaces=ns):
                _remove_keeping_tail(element)

        # Extract the remaining text
        plaintext = text_element.xpath('string()')  # Get all text under <text>
        return clean(plaintext.strip())  # Strip leading/trailing whitespace, then normalise
    except Exception as e:
        # Best effort: report the problem and let the caller continue with other files
        print(f"An error occurred: {e}")
        return None


In [None]:
def load_extract_files(files_path, exclude_tags):
    '''Load all TEI files matching files_path into a list of dictionaries.

    Args:
        files_path (str): Glob pattern for the XML files; "~" is expanded.
            Paths containing "__cts__" are skipped.
        exclude_tags (list): Tag names handed on to extract_plaintext_from_tei.

    Returns:
        list of dict: One dictionary per file, in sorted path order, with
            "urn"  - the file name without the ".xml" extension,
            "lang" - "Ms" if the part after the first hyphen contains "Ms",
                     otherwise that part without digits and without "Bibex",
            "text" - the extracted plaintext (None if extraction failed).
    '''
    xml_paths = sorted(glob.glob(os.path.expanduser(files_path)))
    pta_dict = []
    for xml_path in xml_paths:
        if '__cts__' in xml_path:
            continue
        # Use the file name itself instead of a fixed directory depth
        urn = os.path.basename(xml_path).split(".xml")[0]
        urn_parts = urn.split("-")
        version = urn_parts[1] if len(urn_parts) > 1 else ""
        if "Ms" in version:
            lang = "Ms"
        else:
            # Strip digits and the literal suffix "Bibex"; a character class
            # like [0-9Bibex] would also delete single letters such as "x"
            lang = re.sub("[0-9]", "", version).replace("Bibex", "")
        # Hand over the path so the XML parser decides on the encoding
        # rather than the locale default of open()
        plaintext = extract_plaintext_from_tei(xml_path, exclude_tags)
        if plaintext is None:
            print(f"No plaintext could be extracted from {xml_path}")
        pta_dict.append({"urn": urn, "lang": lang, "text": plaintext})
    return pta_dict

In [177]:
# Elements that are dropped from <text> before the plaintext is extracted
exclude_tags = ["note","rdg"]
# exclude_tags is the second argument of load_extract_files, not of expanduser
severian_plaintext = load_extract_files(os.path.expanduser("~/Dokumente/projekte/pta_data/data/pta0001/*/*.xml"), exclude_tags)

In [185]:
# One row per file; the columns come from the dictionary keys
df = pd.DataFrame(data=severian_plaintext)

In [186]:
# Display the table built from severian_plaintext
df

Unnamed: 0,urn,lang,text
0,pta0001.pta001.pta-MsAb,Ms,Τοῦ αὐτοῦ ὁμιλία εἰς τὸν περι φύσεως νό μον· κ...
1,pta0001.pta001.pta-MsAe,Ms,Τοῦ ἐν ἁγίοις πατρὸς ἡμῶν ἰωάννου τοῦ χρυσοστό...
2,pta0001.pta001.pta-MsAp,Ms,Τοῦ αὐτοῦ ὁμιλία εἰς τὸν περὶ φύσεως νόμον λόγ...
3,pta0001.pta001.pta-MsAv,Ms,Τοῦ αὐτοῦ εἰς τὸν περὶ φύσεως νόμον πᾶσα γραφὴ...
4,pta0001.pta001.pta-MsBe,Ms,Τοῦ αὐτοῦ ἁγίου ἰωάννου τοῦ χρυσοστόμου. λόγος...
...,...,...,...
206,pta0001.pta066.pta-xcl1,xcl,"Երանելոյն Սեբերիանոսի խօսք վասն Զատկի, Կաթարիս..."
207,pta0001.pta067.pta-xcl1,xcl,Նորին երանելոյն Սեբերիանոսի Եմեսու եպիսկոպոսի ...
208,pta0001.pta068.pta-xcl1,xcl,Նորին ասացեալ 'ի սուրբ վկայն յԱկակ մարտիրոս: Ո...
209,pta0001.pta069.pta-xcl1,xcl,Սբյն Սեւերիանոսի Եմեսու եպիսկոպոսի 'ի գալուստ ...


In [189]:
# Write the DataFrame to a CSV file in data directory
csv_file_path = 'data/severian_plaintext.csv'
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(csv_file_path) or ".", exist_ok=True)
df.to_csv(csv_file_path, index=False)

print(f'DataFrame written to {csv_file_path}')

DataFrame written to /home/stockhausen/Dokumente/BBAW/PTA/Workshop-Aarhus/data/severian_plaintext.csv


## Quotes

In [3]:
def extract_quotes_and_parents(tei_file):
    """
    Collects every <ref decls="#biblical"> of a TEI XML file together with
    the text of the element that contains it.

    Args:
        tei_file (str): Path to the TEI XML file (or anything else accepted
            by lxml's etree.parse).

    Returns:
        list of dict: One dictionary per reference with the keys
            "quote" - the cleaned string value of the reference's parent element,
            "ref"   - the value of the reference's cRef attribute (None if absent).
    """
    tei_ns = {'tei': 'http://www.tei-c.org/ns/1.0'}  # Adjust if your TEI XML uses a different namespace
    document_root = etree.parse(tei_file).getroot()

    collected = []
    # Biblical references (quotes and allusions) anywhere below the root
    for biblical_ref in document_root.findall(".//tei:ref[@decls='#biblical']", tei_ns):
        container = biblical_ref.getparent()
        if container is None:
            continue
        collected.append({
            "quote": clean(container.xpath('string()').strip()),
            "ref": biblical_ref.get("cRef"),
        })
    return collected

In [4]:
def load_extract_files_quotes(files_path):
    '''Load the biblical references of all TEI files matching files_path.

    Args:
        files_path (str): Glob pattern for the XML files; "~" is expanded.
            Paths containing "__cts__" are skipped.

    Returns:
        list of dict: One dictionary per file that yielded at least one
        reference, in sorted path order, with
            "urn"    - the file name without the ".xml" extension,
            "lang"   - "Ms" if the part after the first hyphen contains "Ms",
                       otherwise that part without digits and without "Bibex",
            "quotes" - the list returned by extract_quotes_and_parents.
    '''
    xml_paths = sorted(glob.glob(os.path.expanduser(files_path)))
    pta_dict = []
    for xml_path in xml_paths:
        if '__cts__' in xml_path:
            continue
        # Use the file name itself instead of a fixed directory depth
        urn = os.path.basename(xml_path).split(".xml")[0]
        urn_parts = urn.split("-")
        version = urn_parts[1] if len(urn_parts) > 1 else ""
        if "Ms" in version:
            lang = "Ms"
        else:
            lang = re.sub("[0-9]", "", version).replace("Bibex", "")
        # Hand over the path so the XML parser decides on the encoding
        # rather than the locale default of open()
        quotes = extract_quotes_and_parents(xml_path)
        if quotes:
            pta_dict.append({"urn": urn, "lang": lang, "quotes": quotes})
    return pta_dict

In [5]:
# Gather the biblical references of every file below pta0001
severian_quotes = load_extract_files_quotes(
    os.path.expanduser("~/Dokumente/projekte/pta_data/data/pta0001/*/*.xml")
)

In [8]:
json_file_path = 'data/severian_quotes.json'
# open() does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(json_file_path) or ".", exist_ok=True)
with open(json_file_path, 'w', encoding='utf8') as json_file:
    # ensure_ascii=False keeps the non-ASCII text readable in the file
    json.dump(severian_quotes, json_file, indent=4, ensure_ascii=False)  # indent for pretty printing

print(f"Data has been written to {json_file_path}")

Data has been written to data/severian_quotes.json
