# Import libraries and define functions

In [31]:
from pathlib import Path
import re
import os
from os.path import basename

def read_file(filepath):
    """
    Returns the full text of a UTF-8 encoded file.

    Args:
        filepath (Path): Location of the file to load.

    Returns:
        str: Everything the file contains, decoded as UTF-8.
    """
    return filepath.read_text(encoding='utf-8')

def remove_digits(text):
    """
    Strips every digit character out of a string.

    Args:
        text (str): String to clean.

    Returns:
        str: The same string with all digits deleted.
    """
    digit_pattern = re.compile(r'\d')
    return digit_pattern.sub('', text)

def check_dir_exists(directory_path):
    """
    Makes sure a directory is present, creating it when it is missing.

    A short status message is printed in both cases.

    Args:
        directory_path (str): Path to the directory.

    Returns:
        str: The path that was passed in, unchanged.
    """
    # nothing to create when the path is already there
    if os.path.exists(directory_path):
        print("Directory already exists.")
        return directory_path

    os.makedirs(directory_path, exist_ok=True)
    print("New directory created!")
    return directory_path

def write_txt(text_list, author, title):
    """
    Saves every text of a list as its own numbered file.

    Files are placed in ``../data/verse_corpus/`` and are named
    ``{author}_{title}_{n}.txt``, where ``n`` counts from 1. Each text is
    stripped of surrounding whitespace before it is written.

    Args:
        text_list (list): List of texts.
        author (str): Author of the texts.
        title (str): Title of the text.
    """
    out_dir = check_dir_exists(Path('../data/verse_corpus/'))
    for number, content in enumerate(text_list, start=1):
        target = out_dir / f'{author}_{title}_{number}.txt'
        target.write_text(content.strip(), encoding='utf-8')
    print("Text files written to the directory.")

def write_txt_no_loop(text, author, title):
    """
    Saves one text as a single file.

    The file is placed in ``../data/verse_corpus/`` and is named
    ``{author}_{title}.txt``. The text is written exactly as given.

    Args:
        text (str): Input text.
        author (str): Author of the text.
        title (str): Title of the text.

    Returns:
        str: Confirmation message.
    """
    out_dir = check_dir_exists(Path('../data/verse_corpus/'))
    target = out_dir / f'{author}_{title}.txt'
    target.write_text(text, encoding='utf-8')
    return "The file has been written to the directory."

def write_different_titles(list_, author, title):
    """
    Writes each text to its own file, named after the matching title.

    The i-th text of ``list_`` is stripped of surrounding whitespace and
    written to ``../data/verse_corpus/{author}_{title[i]}.txt``.

    Args:
        list_ (list): List of texts.
        author (str): Author of the texts.
        title (list): List of titles, one per text and in the same order.
            Titles beyond the number of texts are ignored.

    Returns:
        str: Confirmation message.

    Raises:
        IndexError: If fewer titles than texts are given. The check runs
            before anything is written, so no partial output is left behind.
    """
    if len(title) < len(list_):
        raise IndexError(
            f'{len(list_)} texts were given but only {len(title)} titles')

    # build paths with pathlib, like the other writer functions do
    directory_path = Path(check_dir_exists('../data/verse_corpus/'))
    for work_title, book in zip(title, list_):
        filename = directory_path / f'{author}_{work_title}.txt'
        with filename.open('w', encoding='utf-8') as out:
            out.write(book.strip())
    return "The files have been written to the appointed directory"

def remove_personae(filepath):
    """
    Removes character names (personae) from the text.

    Everything before the first tab-newline sequence ('\\t\\n') of the file
    is treated as the list of personae; the rest is the text to clean.
    Further speaker labels are collected from the text itself: the first
    token of a line counts as a label when it ends with a dot, unless the
    remainder of that line starts with "(". Tokens listed in
    ``words_not_to_remove`` are never treated as labels.

    Digits are removed from the text and every "." is replaced by ". "
    before the lines are examined. Each output line is rebuilt by joining
    its remaining tokens with single spaces.

    Args:
        filepath (Path): Path to the file.

    Returns:
        str: Text without character names.
    """
    # ordinary words that end a sentence at the start of a line and would
    # otherwise be mistaken for speaker labels
    words_not_to_remove = {"spectemus.", "spectate.", "adeste.", "occidam.", "renavigari.", "puto.", "tibi.",
                           "quod.", "'immane.", "eructat.", "gemuere.", "eatur.", "nihil.", "certos.",
                           "docet.", "nefas.", "quo.", "pulsabis.", "arsisti.", "distendit.", "iam.",
                           "sibi.", "simul.", "tamen.", "age.", "Neronis", "inquirite.", "non.", "aequas.",
                           "vincatur.", "ascribo.", "concipiat.", "novetur.", "effare.", "prius.", "donis.",
                           "scelus.", "supplicis.", "oblitterentur.", "timemur.", "canitque.", "peragite.",
                           "malum.", "Phoebusque.", "demissa."}

    # read the file once and derive both the personae header and the body from it
    raw_text = read_file(filepath)

    # body: everything after the header, without digits, with ". " after every dot
    text_no_digits = remove_digits(raw_text).replace(".", ". ").split('\t\n')[1:]
    # header: whitespace-separated names taken from the unmodified text
    personae = set(raw_text.strip().split('\t\n')[0].split())

    text_lines = " ".join(text_no_digits).strip().split("\n")
    for line in text_lines:
        first_token = line.split(maxsplit=1)
        if not first_token:  # empty or whitespace-only line
            continue
        if len(first_token) >= 2 and first_token[1].startswith("("):
            # presumably a parenthetical remark rather than speech; the
            # leading token is not collected as a label here
            continue
        if first_token[0].endswith("."):
            personae.add(first_token[0])

    personae_set = personae - words_not_to_remove
    return "\n".join(
        " ".join(token for token in line.split() if token not in personae_set)
        for line in text_lines
    )

def remove_titles(text, titles_to_remove=None):
    """
    Removes specified titles from the text.

    Every occurrence of every title is deleted, in the order the titles
    are given.

    Args:
        text (str): Input text.
        titles_to_remove (list, optional): Titles (str) to delete from the
            text. When omitted, the built-in list below is used; it is
            empty until the corpus-specific titles are filled in, so the
            text is then returned unchanged.

    Returns:
        str: Text without the specified titles.
    """
    if titles_to_remove is None:
        # The previous placeholder ``[...]`` held the Ellipsis object, which
        # made str.replace raise a TypeError on every call.
        titles_to_remove = []  # Your list of titles
    for title in titles_to_remove:
        text = text.replace(title, '')
    return text

def remove_latin_numbering(text):
    """
    Deletes upper-case Roman numerals from a string.

    A numeral is matched as a whole word; a dot directly after it is
    deleted together with the numeral.

    Args:
        text (str): Input text.

    Returns:
        str: Text without Latin numerals.
    """
    numeral = re.compile(
        r"\b(?=[MDCLXVIΙ])M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})([IΙ]X|[IΙ]V|V?[IΙ]{0,3})\b\.?"
    )
    return numeral.sub('', text)

# M. Annaeus Lucanus, *Pharsalia*

In [2]:
# location of the Pharsalia source text
pharsalia_path = Path("../data/corpus_perseus_txt/luc_phar.txt")

# a blank line (two consecutive newlines) marks a book boundary in the
# Pharsalia, so reading and splitting on it yields one string per book
pharsalia = read_file(pharsalia_path).split('\n\n')

# one output file per book
write_txt(pharsalia, author='luc', title='phars')

Directory already exists.
Text files written to the directory.


# Ovid, *Amores, Epistulae (vel Heroides), Medicamina faciei femineae, Ars amatoria, Remedia amoris*

In [11]:
# load the file that holds several of Ovid's works
ovid_am_path = Path("../data/corpus_perseus_txt/ovid_am.txt")
ovid_am = read_file(ovid_am_path)

# two consecutive newlines mark the boundary between works
# (these blank lines were added manually after the titles were removed)
ovid_works = ovid_am.split('\n\n')

# NOTE: empty chunks are not filtered out here, so every chunk produced by
# the split has to line up with one title in the list below
ovid_titles = ['am', 'epist', 'medicam', 'ars', 'remed']
write_different_titles(ovid_works, author='ovid', title=ovid_titles)

Directory already exists.


'The files have been written to the appointed directory'

# Ovid, *Fasti*

In [4]:
fasti_path = Path("../data/corpus_perseus_txt/ovid_fast.txt")
fasti = read_file(fasti_path)

# remove numbers that remained in the text as broken links
# (uses the shared helper, like the other cells, instead of an inline regex)
fasti = remove_digits(fasti)
write_txt_no_loop(fasti, author='ovid', title='fasti')

Directory already exists.


'The file has been written to the directory.'

# Ovid, *Metamorphoses*

In [12]:
# the Metamorphoses file needs no cleaning: read it and write it back as is
metamorphoses_path = Path("../data/corpus_perseus_txt/ovid_met.txt")
metamorphoses = read_file(metamorphoses_path)

write_txt_no_loop(text=metamorphoses, author='ovid', title='meta')

Directory already exists.


'The file has been written to the directory.'

# Ovid, *Ex Ponto*

In [15]:
ponto_path = Path("../data/corpus_perseus_txt/ovid_pont.txt")
ponto = read_file(ponto_path)

# drop the work title at the very start of the file, if present;
# str.strip('Ex Ponto') would treat its argument as a set of characters and
# could eat letters of the poem at both ends of the text
work_title = 'Ex Ponto'
if ponto.startswith(work_title):
    ponto = ponto[len(work_title):]

# remove digits as footnotes inside the text
ponto = remove_digits(ponto)

# write file
write_txt_no_loop(ponto, author='ovid', title='ponto')

Directory already exists.


'The file has been written to the directory.'

# Ovid, *Tristia*

In [16]:
tristia_path = Path('../data/corpus_perseus_txt/ovid_tr.txt')

# read the text and drop any digits it contains
tristia = remove_digits(read_file(tristia_path))

# NOTE: comments inside the text are removed manually, not by this cell
write_txt_no_loop(tristia, author='ovid', title='tristia')

Directory already exists.


'The file has been written to the directory.'

# Ovid, *Ibis*

In [17]:
ibis_path = Path("../data/corpus_perseus_txt/ovid_ibis.txt")
ibis = read_file(ibis_path)

# drop the ">" markers, turn double spaces into line breaks, trim the ends
ibis = ibis.replace(">", "")
ibis = ibis.replace("  ", "\n")
ibis = ibis.strip()

# remove any digits left over from line numbers or notes
ibis = remove_digits(ibis)

# save the cleaned poem
write_txt_no_loop(ibis, author='ovid', title='ibis')

Directory already exists.


'The file has been written to the directory.'

# Persius, *Satires*

In [18]:
satires_path = Path("../data/corpus_perseus_txt/persius_sat.txt")

# strip leftover line numbers, then cut the text into individual satires
# (a blank line, i.e. '\n\n', separates them)
satires = remove_digits(read_file(satires_path)).split('\n\n')

write_txt(satires, author='persius', title='sati')

Directory already exists.
Text files written to the directory.


# Seneca

In [19]:
# process every Seneca file (sen_<title>.txt) in the source directory
for filepath in Path("../data/corpus_perseus_txt/").glob("sen_*.txt"):
    # the file stem has the form <author>_<title>; split it once
    author, title = filepath.stem.split("_", 1)
    # strip the speaker names and save the cleaned text
    text_no_personae = remove_personae(filepath)
    write_txt_no_loop(text_no_personae, author=author, title=title)

Directory already exists.
Directory already exists.
Directory already exists.
Directory already exists.
Directory already exists.
Directory already exists.
Directory already exists.
Directory already exists.
Directory already exists.
Directory already exists.


# Silius Italicus, *Punica*

In [20]:
punica_path = Path("../data/corpus_perseus_txt/silius_punica.txt")
punica = remove_digits(read_file(punica_path))

# skip the first 7 characters: the first line holds the word Punica
# (i.e., the title of the work), so it can be removed
punica = punica[7:]

# double spaces become line breaks; three newlines in a row separate the parts
punica = punica.replace("  ", "\n").strip()
punica = punica.split("\n\n\n")

# one output file per part
write_txt(punica, author="sil.ita", title="pun")

Directory already exists.
Text files written to the directory.


# Statius, *Achilleis*

In [23]:
achilleid_path = Path("../data/corpus_perseus_txt/stat_achill.txt")

# drop digits first, then turn double spaces into line breaks
achilleid = remove_digits(read_file(achilleid_path))
achilleid = achilleid.replace("  ", "\n")

# save the whole poem as one file
write_txt_no_loop(achilleid, author="stat", title="achill")

Directory already exists.


'The file has been written to the directory.'

# Statius, *Silvae*

In [24]:
silvae_path = Path("../data/corpus_perseus_txt/stat_silvae.txt")

# drop digits, trim the ends, then split wherever three newlines occur in a row
silvae = remove_digits(read_file(silvae_path))
silvae = silvae.strip().split("\n\n\n")

# write separate files to test Ferri's hypothesis
write_txt(silvae, author="stat", title="silv")

Directory already exists.
Text files written to the directory.


# Statius, *Thebais*

In [25]:
thebais_path = Path("../data/corpus_perseus_txt/stat_theb.txt")

# drop digits, trim the ends, then split on blank lines
thebais = remove_digits(read_file(thebais_path))
thebais = thebais.strip().split("\n\n")

# one output file per part
write_txt(thebais, author="stat", title="theb")

Directory already exists.
Text files written to the directory.


# Valerius Flaccus, *Argonautica*

In [26]:
argonautica_path = Path("../data/corpus_perseus_txt/valflac_argo.txt")

# drop digits, trim the ends, then split wherever three newlines occur in a row
argonautica = remove_digits(read_file(argonautica_path))
argonautica = argonautica.strip().split("\n\n\n")

write_txt(argonautica, author="valflac", title="argon")

Directory already exists.
Text files written to the directory.


# Phaedrus, *Fables*

In [33]:
fables_path = Path('../data/corpus_perseus_txt/phaed_fables.txt')

# remove_titles works on text, not on a path, so the file is read first
fables = read_file(fables_path)
clean_fables = remove_titles(fables).split('\n\n')  # split into books

# write each book to an individual file
write_txt(clean_fables, author='phaed', title='fables')

# Manilius, *Astronomica*

In [32]:
# loop over each file that follows the pattern manil.astro_*.txt
for filename in Path("../data/corpus_perseus_txt/").glob("manil.astro_*.txt"):
    # remove digits within the text, then split into lines so that the
    # first and last 11 lines (added by The Latin Library editors) can be
    # dropped by index
    lines = remove_digits(read_file(filename)).splitlines()
    # join back into a string to write the whole poem and not lines
    astronomica = "\n".join(lines[11:-11])
    # the file name without its extension has the form <author>.<title>
    name_without_ext = os.path.splitext(basename(filename))[0]
    author, title = name_without_ext.split(".", 1)
    # write books to individual txt files
    write_txt_no_loop(astronomica, author=author, title=title)

Directory already exists.
Directory already exists.
Directory already exists.
Directory already exists.
Directory already exists.


# Martial, *Epigrammata*

In [34]:
martial_path = Path("../../collection_data/data/corpus_perseus_txt/martial_lat_.txt")
martial = read_file(martial_path)

# a blank line separates the books; Roman numerals are removed from each one
books = martial.split("\n\n")
processed_books = [remove_latin_numbering(book) for book in books]

# the files are only written when the split produced exactly 14 books
if len(processed_books) == 14:
    # write each book to a separate file with continued numbering
    write_txt(processed_books, author='martial', title='epigr')
else:
    # report the mismatch instead of silently writing nothing
    print(f"Expected 14 books but found {len(processed_books)}; no files were written.")

Directory already exists.
Text files written to the directory.
