# Collection of data
## Import libraries

In [1]:
import pandas as pd
import csv
import re

# process .rtf format documents
from striprtf.striprtf import rtf_to_text

# process the txt files
from glob import glob
import os
from os.path import basename

# checks to the repository of the files
from pathlib import Path
import cmd

## Collecting the Seneca's the Younger tragoediae

The tragedies that are attributed to Seneca the Younger were all scraped using the library [`rperseus`](https://github.com/ropensci/rperseus) created by David Ranzolin.

The following query was formulated:
```
urn <- "urn:cts:latinLit:phi1017"
seneca <- perseus_catalog %>%
    filter(group_name == "Seneca, Lucius Annaeus (Plays)",
           language == "lat") %>% 
    pull(urn) %>% 
    map_df(get_perseus_text)`
```
The query above creates a tibble, which is basically an R-based table with columns and rows. In order to extract this table into a `.csv` format file, so that someone can access it and process it with Pandas using Python, it was reformatted into a `csv` file using the `write.csv` function in R:

`write.csv(seneca, "../raw_text/seneca.csv")`

The texts were all extracted from [Perseus Digital Library](http://www.perseus.tufts.edu/hopper/).

In [2]:
seneca_df = pd.read_csv("../raw_text/seneca.csv", index_col=0).rename(
    columns={"label": "title", "group_name": "author"})
seneca_df

Unnamed: 0,text,urn,author,title,description,language,section
1,Inter multos ac varios errores temere inconsul...,urn:cts:latinLit:phi1017.phi013.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","de Beneficiis, Moral essays Vol 3","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,1
2,"Liber II Inspiciamus, Liberalis virorum optime...",urn:cts:latinLit:phi1017.phi013.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","de Beneficiis, Moral essays Vol 3","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,2
3,Liber III Non referre beneficiis gratiam et es...,urn:cts:latinLit:phi1017.phi013.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","de Beneficiis, Moral essays Vol 3","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,3
4,"Liber IV Ex omnibus, quae tractavimus, Aebuti ...",urn:cts:latinLit:phi1017.phi013.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","de Beneficiis, Moral essays Vol 3","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,4
5,Liber V In prioribus libris videbar consummass...,urn:cts:latinLit:phi1017.phi013.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","de Beneficiis, Moral essays Vol 3","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,5
...,...,...,...,...,...,...,...
1282,"Alcm. Quot ""misera in uno condidi natos parens...",urn:cts:latinLit:phi1017.phi009.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Hercules Oetaeus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,149
1283,Vox Herculis et Alcmena Quid me tenentem regna...,urn:cts:latinLit:phi1017.phi009.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Hercules Oetaeus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,150
1284,"Alcm. Vnde, unde sonus trepidas aures ferit? u...",urn:cts:latinLit:phi1017.phi009.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Hercules Oetaeus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,151
1285,"Alcm. Mane parumper, cessit, ex oculis, abit, ...",urn:cts:latinLit:phi1017.phi009.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Hercules Oetaeus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,152


In [3]:
# seneca_tragoediae = seneca_df.loc[seneca_df["label"].str.contains("Tragoediae", case = False)]
# case = False : just in case there is the word "Tragoediae" written in a different way

seneca_tragoediae = seneca_df.loc[seneca_df['title'].str.contains("Tragoediae",
                                                                  case=False)]
seneca_tragoediae

Unnamed: 0,text,urn,author,title,description,language,section
20,Iam nocte Titan dubius expulsa redit et nube m...,urn:cts:latinLit:phi1017.phi006.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Oedipus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,1
21,"Ioc. Quid iuvat, coniunx, mala gravare questu?...",urn:cts:latinLit:phi1017.phi006.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Oedipus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,2
22,"Oed. Abest pavoris crimen ac probrum procul, v...",urn:cts:latinLit:phi1017.phi006.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Oedipus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,3
23,Ioc. Quid sera mortis vota nunc demens facis? ...,urn:cts:latinLit:phi1017.phi006.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Oedipus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,4
24,"Oed. Ille, ille dirus callidi monstri cinis in...",urn:cts:latinLit:phi1017.phi006.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Oedipus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,5
...,...,...,...,...,...,...,...
1282,"Alcm. Quot ""misera in uno condidi natos parens...",urn:cts:latinLit:phi1017.phi009.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Hercules Oetaeus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,149
1283,Vox Herculis et Alcmena Quid me tenentem regna...,urn:cts:latinLit:phi1017.phi009.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Hercules Oetaeus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,150
1284,"Alcm. Vnde, unde sonus trepidas aures ferit? u...",urn:cts:latinLit:phi1017.phi009.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Hercules Oetaeus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,151
1285,"Alcm. Mane parumper, cessit, ex oculis, abit, ...",urn:cts:latinLit:phi1017.phi009.perseus-lat2,"Seneca, Lucius Annaeus (Plays)","Hercules Oetaeus, Tragoediae","Seneca, Lucius Annaeus, ca. 4 B.C.-65 A.D, cre...",lat,152


In [4]:
seneca_df = seneca_tragoediae.groupby(['author', 'title', 'urn', 'language'])[
    'text'].apply('\n'.join).reset_index()

In [5]:
# split the title and the genre to two different columns
# expand=True in order to break up the lists

title_genre = seneca_df['title'].str.split(pat=',', expand=True)
seneca_df[['title', 'genre']] = title_genre

# reorder the columns
seneca_df = seneca_df.reindex(
    columns=['author', 'title', 'genre', 'urn', 'language', 'text'])
seneca_df

Unnamed: 0,author,title,genre,urn,language,text
0,"Seneca, Lucius Annaeus (Plays)",Agamemnon,Tragoediae,urn:cts:latinLit:phi1017.phi007.perseus-lat2,lat,Opaca linquens Ditis inferni loca adsum profun...
1,"Seneca, Lucius Annaeus (Plays)",Hercules Furens,Tragoediae,urn:cts:latinLit:phi1017.phi001.perseus-lat2,lat,Soror Tonantis (hoc enim solum mihi nomen reli...
2,"Seneca, Lucius Annaeus (Plays)",Hercules Oetaeus,Tragoediae,urn:cts:latinLit:phi1017.phi009.perseus-lat2,lat,"Sator deorum, cuius excussum manu utraeque Pho..."
3,"Seneca, Lucius Annaeus (Plays)",Medea,Tragoediae,urn:cts:latinLit:phi1017.phi004.perseus-lat2,lat,"Di coniugales tuque genialis tori, Lucina, cus..."
4,"Seneca, Lucius Annaeus (Plays)",Octavia,Tragoediae,urn:cts:latinLit:phi1017.phi010.perseus-lat2,lat,Iam vaga caelo sidera fulgens Aurora fugat. su...
5,"Seneca, Lucius Annaeus (Plays)",Oedipus,Tragoediae,urn:cts:latinLit:phi1017.phi006.perseus-lat2,lat,Iam nocte Titan dubius expulsa redit et nube m...
6,"Seneca, Lucius Annaeus (Plays)",Phaedra,Tragoediae,urn:cts:latinLit:phi1017.phi005.perseus-lat2,lat,"Ite, umbrosas cingite silvas summaque montis i..."
7,"Seneca, Lucius Annaeus (Plays)",Phoenissae,Tragoediae,urn:cts:latinLit:phi1017.phi003.perseus-lat2,lat,Caeci parentis regimen et fessi unicum lateris...
8,"Seneca, Lucius Annaeus (Plays)",Thyestes,Tragoediae,urn:cts:latinLit:phi1017.phi008.perseus-lat2,lat,Quis inferorum sede ab infausta extrahit avido...
9,"Seneca, Lucius Annaeus (Plays)",Troades Furens,Tragoediae,urn:cts:latinLit:phi1017.phi002.perseus-lat2,lat,Quicumque regno fidit et magna potens dominatu...


In [6]:
# uncomment and run the following command only if you want to store locally in your computer the dataframe above
# seneca_df.to_csv("seneca_tragoediae.csv", encoding='utf-8')

In [7]:
abbr_titles_list = ['agam', 'herf', 'hero',
                    'mede', 'octa', 'oedi',
                    'phaed', 'phoe', 'thye', 'troa']
abbr_titles = pd.Series(abbr_titles_list)
abbr_titles

0     agam
1     herf
2     hero
3     mede
4     octa
5     oedi
6    phaed
7     phoe
8     thye
9     troa
dtype: object

In [8]:
texts_abbr_titles = pd.concat([seneca_df['text'], abbr_titles],
                              axis=1).rename(columns={0: "abbreviated_title"})
# texts_abbr_titles

In [9]:
def check_path_exist(pathname):
    Exist = os.path.exists(pathname)
    if not Exist:
        # create a new directory since it does not exist
        os.makedirs(pathname)
        print("The new directory was created successfully!")
    else:
        print("Directory already exists!")

In [10]:
pathname = "../data/corpus/"
check_path_exist(pathname)

Directory already exists!


In [11]:
path = "../data/corpus/"

n = 1
for text, title in zip(texts_abbr_titles["text"],
                       texts_abbr_titles["abbreviated_title"]):
    author = "seneca"
    filename = f"{author}_{title}{n}.txt"
    with open(path + filename, "w", encoding='utf-8') as inp:
        n += 1
        inp.writelines(str(text))

In [12]:
# create another corpus with only the Senecan plays inside
pathname = "../data/corpus_seneca/"
check_path_exist(pathname)

n = 1
for text, title in zip(texts_abbr_titles["text"],
                       texts_abbr_titles["abbreviated_title"]):
    author = "seneca"
    filename = f"{author}_{title}{n}.txt"
    with open(pathname + filename, "w", encoding='utf-8') as inp:
        n += 1
        inp.writelines(str(text))

Directory already exists!


By taking a close look to the texts in the previous dataframe about Seneca, someone will see that the play come along with the people that speak in every tragedy (i.e., the characters that participate in the play). An example can be given by the following extract:

    `El. Concede mortem,
    Aeg. Si recusares, darem, rudis est tyrannus morte qui poenam exigit.
    El. Mortem aliquid ultra est?
    Aeg. Vita, si cupias mori. abripite, famuli, monstrum et avectam procul ultra Mycenas ultimo in regni angulo vincite saeptam nocte tenebrosi specus, ut inquietam virginem carcer domet.
    Clyt. At ista poenas capite persolvet suo captiva coniunx, regii paelex tori. trahite, ut sequatur coniugem ereptum milli.
    Cass. Ne trahite. vestros ipsa praecedam gradus. perferre prima nuntium Phrygibus meis propero: repletum ratibus eversis mare, captas Mycenas, mille ductorem ducum, ut paria fata Troicis lueret malis, perisse dono feminae: stupro, dolo. nihil moramur, rapite, quin grates ago. iam, iam iuvat vixisse post Troiam, iuvat.
    Clyt. Furiosa, morere.
    Cass. Veniet et vobis furor.—`

What we are targeting to remove is names such as: 
        
        `"El.", "Aeg.", "Clyt.", etc.`
        
As it was said before, names like these correspond to the characters that participate in the play and are not vital part of the tragedies themselves.

Something that will be really useful for the oncoming analysis, would be to remove the names of these characters from the `.txt` files since otherwise will create a lot of noise to the results.



### First level of preprocessing of the Senecan plays

**Proceed to this level only if you want to remove the names of the characters from the texts**

A first level of preprocessing in the Senecan plays in order to prepare them for the extraction into `.txt` files that do not contain the names of the characters.

The first function `extracting_speaking_people` takes as an argument a filename. Aim of this function is to extract only the first token of every line. 

Why do we traget at the first token? 

In the files that were generated previously every speaking line is literally one line, which means that every character no matter how long they speak their words will be contained in one line. Following this line of thought, the first token of every line is the character that speaks at each moment of the play. In other words, this function extracts the first word of each line and then stores every name of every character that speaks in a list.

In [13]:
def extract_speaking_characters(filename):
    with open(filename, encoding='utf8') as inp:
        text = inp.read()
    lines = text.split("\n")[3:]
# start splitting after index = 3 because otherwise it will remove
# essential words of the text, since in the first line
# there is no speaking character
    names = []
    for line in lines:
        first_token = line.split()[0]
        names.append(first_token)
    return names

The next function, `remove_speaking_characters` takes two positional arguments, a filename and the list of the names that was generated by calling the function `extract_speaking_characters`.

This function aims to go through every token in the text and check if this token is in the list of names. It will only store the tokens that are not contained in the name list.

With a `for` loop both of these functions will be applied to the `.txt` files of the repository `data_txt_files` that have the following filename:
    `seneca_{number}.txt)`

The output will be stored in an overarching dictionary that has as keys the names of the files (i.e., `seneca_{number}.txt`) and as values the cleaned texts (i.e., the texts of the plays without the names of the characters in them.

In [14]:
def remove_speaking_characters(fname, names):
    with open(fname, encoding='utf-8') as inp:
        text = inp.read()
    clean_text = ' '.join([token for token in text.split()
                           if not any(word in token for word in names)])
    return clean_text

In [15]:
# result_seneca = {}
# for filename in glob('/Users/paschalis/Documents/MA_DH/Thesis/Code/Collection_data/data_txt_files/*seneca_[0-9]*.txt'):
#     print(filename)
#     speaking_characters = extract_speaking_characters(filename)
#     clean_text = remove_speaking_characters(filename, speaking_characters)
#     name = basename(filename)
#     result_seneca[name] = clean_text

In [16]:
# don't run this command if other files have been generated
result_seneca = {}
for filename in glob('../data/corpus/*'):
    speaking_characters = extract_speaking_characters(filename)
    clean_text = remove_speaking_characters(filename, speaking_characters)
    name = basename(filename)
    result_seneca[name] = clean_text

IndexError: list index out of range

After that, the only thing that remains is to rewrite the `.txt` files. This time, they will contain the plays without the speaking characters.

In [17]:
# path = "/Users/paschalis/Documents/MA_DH/Thesis/Code/Analysis/corpus/"

n = 1
for key, value in result_seneca.items():
    with open(path + key, "w", encoding='utf-8') as inp:
        n += 1
        inp.write(str(value))

In [18]:
# check if the repository is empty
# otherwise print the number of files that exist there

def check_repository_length(directory_path, directory_files):
    # Checking the length of list
    if len(directory_files) == 0:
        print("No files found in the directory.")
    else:
        print(f'''{len(directory_files)} files found in the directory.
        The following files are contained in this directory:\n''')
    cli.columnize(directory_files,
                  displaywidth=55)

In [19]:
directoryPath = "../data/corpus/"
directory_files = sorted(os.listdir(directoryPath))
cli = cmd.Cmd()


check_repository_length(directoryPath, directory_files)
# when running the cell for the first time the number should be 11
# 10 tragedies by Seneca and 1 DS_Store file

34 files found in the directory.
        The following files are contained in this directory:

.DS_Store          persius_sati2.txt  seneca_phoe8.txt 
lucan_beci1.txt    persius_sati3.txt  seneca_thye9.txt 
lucan_beci10.txt   persius_sati4.txt  seneca_troa10.txt
lucan_beci2.txt    persius_sati5.txt  statius_achi1.txt
lucan_beci3.txt    persius_sati6.txt  statius_achi2.txt
lucan_beci4.txt    seneca_agam1.txt   statius_silv3.txt
lucan_beci5.txt    seneca_herf2.txt   statius_silv4.txt
lucan_beci6.txt    seneca_hero3.txt   statius_silv5.txt
lucan_beci7.txt    seneca_mede4.txt   statius_silv6.txt
lucan_beci8.txt    seneca_octa5.txt   statius_silv7.txt
lucan_beci9.txt    seneca_oedi6.txt 
persius_sati1.txt  seneca_phaed7.txt


## Collecting Persius' *Satires*

*Satires* or *Satures* by Aulus Persius Flaccus were collected from the [Project Gutenberg](https://www.gutenberg.org/ebooks/22119) and they were saved in an overarching `.rtf` format file. The file contained the 6 *Satires* written by Persius.

In [20]:
with open("../raw_text/satires_perseus.rtf") as infile:
    content = infile.read()  # reading the file
    satires = rtf_to_text(content)  # convert into a string
    satires_cleaned = re.sub('[\d-]', '', satires).split("SATURA")[1:]
    # split the text in each Satura
    # [1:] because otherwise creates an empty list
    # `\d` regex matches any digit character
    # digit character were removed because the file contained the number of the lines

### First level of preprocessing to the *Satires* by Persius

Since the current format of the text has a whitespace in front of every line we need to remove these whitespaces. Every satira now looks like this:
```
|  O curas hominum! o quantum est in rebus inane!
|  ‘Quis leget haec?’ Min tu istud ais? nemo hercule! ‘Nemo?’
|  Vel duo, vel nemo. ‘Turpe et miserabile!’ Quare?
|  ne mihi Polydamas et Troiades Labeonem
|  praetulerint? nugae. non, si quid turbida Roma                   
|  elevet, accedas examenque inprobum in illa
|  castiges trutina, nec te quaesiveris extra.
```
Moreover, every *Satira* contains in the first three lines of every Satira there is the number of the book in a latin number (I, II, III, etc.) and also some extra spaces that can be removed, because they are not essential part of the original text.

To accomplish these two targets, a function was created (`def remove_leading_whitespaces`. This function takes one positional argument: a set of texts.

The result of this function will be saved in a dictionary and then the values of this dictionary will be written in separate `.txt` files.

In [21]:
def remove_leading_whitespaces(texts, n_index):
    stripped_texts = [line.strip()
                      for line in str(texts).splitlines()][n_index:]
    output = "\n".join(stripped_texts).strip()
    return output

In [22]:
num = 1
n_index = 1
# return only after [1:] because otherwise we will have information
# that is not included in the original text
result_persius = {}
for text in satires_cleaned:
    values = remove_leading_whitespaces(text, n_index)
    result_persius[num] = values
    num += 1

In [23]:
path = "../data/corpus/"
n = 1
author = "persius"
for satira in result_persius.values():
    filename = f"{author}_sati{n}.txt"
    with open(path + filename, "w", encoding='utf-8') as inp:
        n += 1
        inp.writelines(satira)

In [24]:
directoryPath = "../data/corpus/"
directory_files = sorted(os.listdir(directoryPath))
cli = cmd.Cmd()
check_repository_length(directoryPath, directory_files)

# when running this cell for the first time the number should be 17
# 10 tragedies by Seneca
# 6 Satires by Persius
# and 1 DS_Store file

34 files found in the directory.
        The following files are contained in this directory:

.DS_Store          persius_sati2.txt  seneca_phoe8.txt 
lucan_beci1.txt    persius_sati3.txt  seneca_thye9.txt 
lucan_beci10.txt   persius_sati4.txt  seneca_troa10.txt
lucan_beci2.txt    persius_sati5.txt  statius_achi1.txt
lucan_beci3.txt    persius_sati6.txt  statius_achi2.txt
lucan_beci4.txt    seneca_agam1.txt   statius_silv3.txt
lucan_beci5.txt    seneca_herf2.txt   statius_silv4.txt
lucan_beci6.txt    seneca_hero3.txt   statius_silv5.txt
lucan_beci7.txt    seneca_mede4.txt   statius_silv6.txt
lucan_beci8.txt    seneca_octa5.txt   statius_silv7.txt
lucan_beci9.txt    seneca_oedi6.txt 
persius_sati1.txt  seneca_phaed7.txt


## Collecting Lucan's *Bellum Civile*

The work *De Bellum Civile* by Marcus Annaeus Lucanus were collected in an overarching `.rtf` format file from [The Latin Library](http://thelatinlibrary.com/lucan.html). This file contained the 10 books that consist the *Bellum Civile* corpus.

In [25]:
with open("../raw_text/lucan_bellum_civile.rtf") as infile:
    content = infile.read()  # reading the file
    # convertinf rtf into a single string
    bellum_civile = rtf_to_text(content).strip()
    bellum_civile_cleaned = re.sub(
        '[\d-]', '', bellum_civile).split("Liber ")[1:]
    # removing the line numbers and split according to the books ("Liber")
    # skipping the index 0 list since it is empty

In [26]:
num = 1
result_lucan = {}
for text in bellum_civile_cleaned:
    values = remove_leading_whitespaces(text, n_index=None)
    result_lucan[num] = values
    num += 1

In [27]:
path = "../data/corpus/"
n = 1
author = "lucan"
for book in result_lucan.values():
    filename = f"{author}_beci{n}.txt"
    with open(path + filename, "w", encoding='utf-8') as inp:
        n += 1
        inp.writelines(book)

In [28]:
# the number now should be 27 (.txt files):
# 10 tragedies by Seneca the Younger
# 6 Satires by Persius
# 10 Books of De Bellum Civile by Lucan
# + 1(.DS_Store is an automatically generated file by the Macintosh OSX operating system)

# calling the function
directoryPath = "../data/corpus/"
directory_files = sorted(os.listdir(directoryPath))
cli = cmd.Cmd()

check_repository_length(directoryPath, directory_files)

34 files found in the directory.
        The following files are contained in this directory:

.DS_Store          persius_sati2.txt  seneca_phoe8.txt 
lucan_beci1.txt    persius_sati3.txt  seneca_thye9.txt 
lucan_beci10.txt   persius_sati4.txt  seneca_troa10.txt
lucan_beci2.txt    persius_sati5.txt  statius_achi1.txt
lucan_beci3.txt    persius_sati6.txt  statius_achi2.txt
lucan_beci4.txt    seneca_agam1.txt   statius_silv3.txt
lucan_beci5.txt    seneca_herf2.txt   statius_silv4.txt
lucan_beci6.txt    seneca_hero3.txt   statius_silv5.txt
lucan_beci7.txt    seneca_mede4.txt   statius_silv6.txt
lucan_beci8.txt    seneca_octa5.txt   statius_silv7.txt
lucan_beci9.txt    seneca_oedi6.txt 
persius_sati1.txt  seneca_phaed7.txt


## Collecting Statius *Silvae* and *Achilleid*
*Silvae* and *Achilleid* that are attributed to Publius Papinius Statius were all manually collected from the [Latin Library](http://thelatinlibrary.com/statius.html).

They were saved in two overarching `.rtf` documents; one for *Achilleis* and one for *Silvae*.

### First level of preprocessing of *Achilleid*
As with the *Satires* by Persius, we need to remove the leading whitespace and the numbers of the verses from each line. We will follow the same process for both of the texts, *Achilleid* and *Silvae*.

In [29]:
with open("../raw_text/statius_achilleid.rtf") as infile:
    content = infile.read()  # reading the file
    achilleid = rtf_to_text(content).strip()  # convert into a string
    achilleid_cleaned = re.sub('[\d-]', '', achilleid).split("Achilleid")[1:]

In [30]:
num = 1
result_achilleid = {}
for text in achilleid_cleaned:
    values = remove_leading_whitespaces(text, n_index=2)
    result_achilleid[num] = values
    num += 1

In [31]:
path = "../data/corpus/"
n = 1
author = "statius"
for book in result_achilleid.values():
    filename = f"{author}_achi{n}.txt"
    with open(path + filename, "w", encoding='utf-8') as inp:
        n += 1
        inp.writelines(book)

In [32]:
directoryPath = "../data/corpus/"
directory_files = sorted(os.listdir(directoryPath))
cli = cmd.Cmd()

# calling the function

check_repository_length(directoryPath, directory_files)

34 files found in the directory.
        The following files are contained in this directory:

.DS_Store          persius_sati2.txt  seneca_phoe8.txt 
lucan_beci1.txt    persius_sati3.txt  seneca_thye9.txt 
lucan_beci10.txt   persius_sati4.txt  seneca_troa10.txt
lucan_beci2.txt    persius_sati5.txt  statius_achi1.txt
lucan_beci3.txt    persius_sati6.txt  statius_achi2.txt
lucan_beci4.txt    seneca_agam1.txt   statius_silv3.txt
lucan_beci5.txt    seneca_herf2.txt   statius_silv4.txt
lucan_beci6.txt    seneca_hero3.txt   statius_silv5.txt
lucan_beci7.txt    seneca_mede4.txt   statius_silv6.txt
lucan_beci8.txt    seneca_octa5.txt   statius_silv7.txt
lucan_beci9.txt    seneca_oedi6.txt 
persius_sati1.txt  seneca_phaed7.txt


### First level of preprocessing of *Silvae*

In [33]:
with open("../raw_text/statius_silvae.rtf") as infile:
    text = infile.read()  # reading the file
    silvae = rtf_to_text(text).strip()  # convert into a string
    silvae_cleaned = re.sub('[\d-]', '', silvae).split("SILVAE")[1:]

In [34]:
num = 1
result_silvae = {}
for book in silvae_cleaned:
    values = remove_leading_whitespaces(book, n_index=None)
    result_silvae[num] = values
    num += 1

In [35]:
path = "../data/corpus/"
n = 3
# n = 3 since we want to continue the counting from the two book of Achilleid and onwards
author = "statius"
for book in result_silvae.values():
    filename = f"{author}_silv{n}.txt"
    with open(path + filename, "w", encoding='utf-8') as inp:
        n += 1
        inp.writelines(book)

In [36]:
directoryPath = "../data/corpus/"
directory_files = sorted(os.listdir(directoryPath))
cli = cmd.Cmd()

# calling the function

check_repository_length(directoryPath, directory_files)

34 files found in the directory.
        The following files are contained in this directory:

.DS_Store          persius_sati2.txt  seneca_phoe8.txt 
lucan_beci1.txt    persius_sati3.txt  seneca_thye9.txt 
lucan_beci10.txt   persius_sati4.txt  seneca_troa10.txt
lucan_beci2.txt    persius_sati5.txt  statius_achi1.txt
lucan_beci3.txt    persius_sati6.txt  statius_achi2.txt
lucan_beci4.txt    seneca_agam1.txt   statius_silv3.txt
lucan_beci5.txt    seneca_herf2.txt   statius_silv4.txt
lucan_beci6.txt    seneca_hero3.txt   statius_silv5.txt
lucan_beci7.txt    seneca_mede4.txt   statius_silv6.txt
lucan_beci8.txt    seneca_octa5.txt   statius_silv7.txt
lucan_beci9.txt    seneca_oedi6.txt 
persius_sati1.txt  seneca_phaed7.txt
