In [16]:
import os
import re
from collections import Counter

from cltk.data.fetch import FetchCorpus
from cltk.lemmatize.lat import LatinBackoffLemmatizer
from cltk.tag import ner

In [17]:
# Uncomment below to download models for Latin lemmatizer
# corpus_downloader = FetchCorpus(language="lat")
# corpus_downloader.import_corpus("lat_models_cltk")


In [18]:
# AP Corpus
# Paths are listed per author and then joined, Vergil first, Caesar second.
vergil_texts = [
    "vergil/aen1.txt",
    "vergil/aen2.txt",
    "vergil/aen4.txt",
    "vergil/aen6.txt",
]
caesar_texts = [
    "caesar/gall1.txt",
    "caesar/gall4.txt",
    "caesar/gall5.txt",
    "caesar/gall6.txt",
]
ap_corpus = vergil_texts + caesar_texts

In [19]:
# Create a dictionary of clean text, lemmata, and word frequencies for each text in the corpus.
def lemmatize_text(text):
    """Lemmatize a whitespace-separated string and drop entries with an empty lemma slot.

    Args:
        text: A string of tokens separated by whitespace.

    Returns:
        A list of the items produced by the module-level ``lemmatizer`` whose
        second element is not the empty string. (Presumably each item is a
        ``(token, lemma)`` pair -- confirm against the lemmatizer's output.)
    """
    # split() with no argument collapses runs of whitespace, so consecutive
    # separators never turn into empty-string tokens handed to the lemmatizer.
    tokens = text.split()
    return [pair for pair in lemmatizer.lemmatize(tokens) if pair[1] != '']

# Directory holding the corpus files. expanduser resolves "~" to the current
# user's home directory, so the notebook is not tied to one machine's account.
path_to_texts = os.path.expanduser("~/cltk_data/latin/text/latin_text_latin_library/")
# Every character that is not an ASCII letter is replaced by a space when cleaning.
pattern = re.compile('[^a-zA-Z]')
# Maps a text title (corpus path without its extension) to its derived data.
texts = {}
lemmatizer = LatinBackoffLemmatizer()
for text in ap_corpus:
    text_path = path_to_texts + text
    # Explicit encoding so the read does not depend on the platform's default.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(text_path, encoding="utf-8") as file:
        raw_text = file.read()
    clean_text = pattern.sub(' ', raw_text).lower().strip()
    # Strip the file extension, e.g. "vergil/aen1.txt" -> "vergil/aen1".
    text_title = os.path.splitext(text)[0]
    lemmata = lemmatize_text(clean_text)
    texts[text_title] = {
        "clean_text": clean_text,
        "lemmata": lemmata,
        # Tally by the second element of each item, the same slot lemmatize_text filters on.
        "word_frequencies": Counter(pair[1] for pair in lemmata),
    }


In [20]:
# Load and parse Diederich vocab
# The first three lines of the file are skipped; from each remaining line only
# the text before the first ";" is kept (the whole line if it has no ";").
with open("diederich.txt") as vocab_file:
    diederich_vocab = [
        entry.partition(";")[0]
        for line_number, entry in enumerate(vocab_file)
        if line_number >= 3
    ]

In [37]:
# Number of leading entries of diederich_vocab that count as "known" vocabulary.
VOCAB_SIZE = 1250
# Build the lookup once: a set gives constant-time membership tests instead of
# re-slicing and linearly scanning the list for every lemma of every text.
known_vocab = set(diederich_vocab[:VOCAB_SIZE])
for title, entry in texts.items():
    word_frequencies = entry["word_frequencies"]
    print(f"{title} Vocab Statistics")
    # Total Words counts every occurrence; Total Vocab counts distinct entries.
    total_words = sum(word_frequencies.values())
    print(f"Total Words: {total_words}")
    total_vocab = len(word_frequencies)
    print(f"Total Vocab: {total_vocab}")
    known_frequencies = {
        word: freq for word, freq in word_frequencies.items() if word in known_vocab
    }
    total_known_words = sum(known_frequencies.values())
    total_known_vocab = len(known_frequencies)
    print(f"Total Known Words: {total_known_words}")
    print(f"Total Known Vocab: {total_known_vocab}")
    # The "Percent" figures are printed as fractions (not multiplied by 100).
    # An empty text reports 0.0 instead of raising ZeroDivisionError.
    known_word_ratio = total_known_words / total_words if total_words else 0.0
    known_vocab_ratio = total_known_vocab / total_vocab if total_vocab else 0.0
    print(f"Percent Known Words: {known_word_ratio}")
    print(f"Percent Known Vocab: {known_vocab_ratio}")
    print("=" * 10)

vergil/aen1 Vocab Statistics
Total Words: 4896
Total Vocab: 1753
Total Known Words: 2995
Total Known Vocab: 647
Percent Known Words: 0.6117238562091504
Percent Known Vocab: 0.3690815744438106
vergil/aen2 Vocab Statistics
Total Words: 5191
Total Vocab: 1777
Total Known Words: 3046
Total Known Vocab: 623
Percent Known Words: 0.5867848198805625
Percent Known Vocab: 0.3505908835115363
vergil/aen4 Vocab Statistics
Total Words: 4588
Total Vocab: 1704
Total Known Words: 2773
Total Known Vocab: 639
Percent Known Words: 0.6044027898866609
Percent Known Vocab: 0.375
vergil/aen6 Vocab Statistics
Total Words: 5869
Total Vocab: 2048
Total Known Words: 3690
Total Known Vocab: 716
Percent Known Words: 0.6287272107684444
Percent Known Vocab: 0.349609375
caesar/gall1 Vocab Statistics
Total Words: 8197
Total Vocab: 1546
Total Known Words: 5087
Total Known Vocab: 575
Percent Known Words: 0.6205928998414054
Percent Known Vocab: 0.3719275549805951
caesar/gall4 Vocab Statistics
Total Words: 4604
Total Vocab