In [1]:
import os
import re
from collections import Counter

from cltk.data.fetch import FetchCorpus
from cltk.lemmatize.lat import LatinBackoffLemmatizer
from cltk.tag.ner import tag_ner

from vocab_utils import CorpusAnalytics, diederich_vocab

In [2]:
# Uncomment below to download models for Latin lemmatizer
# corpus_downloader = FetchCorpus(language="lat")
# corpus_downloader.import_corpus("lat_models_cltk")


In [3]:
# AP Corpus: the Aeneid (Vergil) and Gallic War (Caesar) books analysed below.
# Entries are passed to CorpusAnalytics together with `path_to_texts`
# (presumably resolved relative to it -- confirm in vocab_utils).
ap_latin_texts = [
    "vergil/aen1.txt",
    "vergil/aen2.txt",
    "vergil/aen4.txt",
    "vergil/aen6.txt",
    "caesar/gall1.txt",
    "caesar/gall4.txt",
    "caesar/gall5.txt",
    "caesar/gall6.txt",
]

# Resolve the corpus directory under the current user's home directory instead
# of hard-coding one machine's absolute path, so the notebook runs for anyone
# whose texts live under ~/cltk_data. The trailing slash is kept on purpose:
# the original string had one and CorpusAnalytics may rely on it when building
# file paths.
path_to_texts = os.path.expanduser(
    "~/cltk_data/latin/text/latin_text_latin_library/"
)

# Third argument is the language code handed to CorpusAnalytics ("lat").
ap_latin_corpus = CorpusAnalytics(ap_latin_texts, path_to_texts, "lat")

In [4]:
# Number of leading entries of `diederich_vocab` treated as "known" vocabulary.
VOCAB_SIZE = 1500

# Build the known-vocabulary lookup once. Slicing the list inside the loop
# re-created it for every lemma and made each membership test a linear scan.
# Assumes diederich_vocab is a sequence of hashable lemma strings -- TODO confirm.
known_vocab = set(diederich_vocab[:VOCAB_SIZE])

# Per-text coverage report: how much of each text's filtered lemmata (by token
# count and by distinct lemma) falls inside the known vocabulary.
for text_name, text_stats in ap_latin_corpus.analytics.items():
    lemma_freqs = text_stats["filtered_lemmata_frequencies"]

    print(f"{text_name} Vocab Statistics")
    total_words = sum(lemma_freqs.values())
    print(f"Total Words: {total_words}")
    total_vocab = len(lemma_freqs)
    print(f"Total Vocab: {total_vocab}")

    # Frequencies of only those lemmata that are in the known vocabulary.
    known_freqs = [
        freq for lemma, freq in lemma_freqs.items() if lemma in known_vocab
    ]
    total_known_words = sum(known_freqs)
    total_known_vocab = len(known_freqs)
    print(f"Total Known Words: {total_known_words}")
    print(f"Total Known Vocab: {total_known_vocab}")

    # Guard the ratios so a text with no filtered lemmata reports 0.0 instead
    # of raising ZeroDivisionError and aborting the whole report.
    # Note: despite the "Percent" label these are fractions in [0, 1].
    known_words_ratio = total_known_words / total_words if total_words else 0.0
    known_vocab_ratio = total_known_vocab / total_vocab if total_vocab else 0.0
    print(f"Percent Known Words: {known_words_ratio}")
    print(f"Percent Known Vocab: {known_vocab_ratio}")
    print("=" * 10)

vergil/aen1 Vocab Statistics
Total Words: 4808
Total Vocab: 1748
Total Known Words: 3182
Total Known Vocab: 736
Percent Known Words: 0.6618136439267887
Percent Known Vocab: 0.42105263157894735
vergil/aen2 Vocab Statistics
Total Words: 5145
Total Vocab: 1706
Total Known Words: 3303
Total Known Vocab: 712
Percent Known Words: 0.6419825072886297
Percent Known Vocab: 0.41735052754982416
vergil/aen4 Vocab Statistics
Total Words: 4526
Total Vocab: 1652
Total Known Words: 2993
Total Known Vocab: 715
Percent Known Words: 0.6612903225806451
Percent Known Vocab: 0.4328087167070218
vergil/aen6 Vocab Statistics
Total Words: 5788
Total Vocab: 1977
Total Known Words: 4020
Total Known Vocab: 820
Percent Known Words: 0.6945404284727021
Percent Known Vocab: 0.4147698533131007
caesar/gall1 Vocab Statistics
Total Words: 7838
Total Vocab: 1535
Total Known Words: 5522
Total Known Vocab: 650
Percent Known Words: 0.7045164582801735
Percent Known Vocab: 0.4234527687296417
caesar/gall4 Vocab Statistics
Total W

In [7]:
# Sample of text, tokens, and NERs
# This cell: the first 1000 characters of the "clean_text" entry stored for
# the "vergil/aen1" text.
ap_latin_corpus.analytics["vergil/aen1"]["clean_text"][:1000]

'Vergil  Aeneid I                                      P  VERGILI MARONIS AENEIDOS LIBER PRIMVS       Arma virumque cano  Troiae qui primus ab oris  Italiam  fato profugus  Laviniaque venit  litora  multum ille et terris iactatus et alto  vi superum saevae memorem Iunonis ob iram   multa quoque et bello passus  dum conderet urbem        inferretque deos Latio  genus unde Latinum   Albanique patres  atque altae moenia Romae       Musa  mihi causas memora  quo numine laeso   quidve dolens  regina deum tot volvere casus  insignem pietate virum  tot adire labores        impulerit  Tantaene animis caelestibus irae       Urbs antiqua fuit  Tyrii tenuere coloni   Karthago  Italiam contra Tiberinaque longe  ostia  dives opum studiisque asperrima belli   quam Iuno fertur terris magis omnibus unam        posthabita coluisse Samo  hic illius arma   hic currus fuit  hoc regnum dea gentibus esse   si qua fata sinant  iam tum tenditque fovetque   Progeniem sed enim Troiano a sanguine duci  audierat 

In [8]:
# First 50 items of the "tokenized_lemmata" entry for "vergil/aen1".
ap_latin_corpus.analytics["vergil/aen1"]["tokenized_lemmata"][:50]


['Vergil',
 'Aeneid',
 'I',
 'P',
 'VERGILI',
 'MARONIS',
 'AENEIDOS',
 'LIBER',
 'PRIMVS',
 'Arma',
 'vir',
 'cano',
 'Troius',
 'qui',
 'primus',
 'ab',
 'ora',
 'Italia',
 'fatum',
 'profugus',
 'Laviniaque',
 'venio',
 'litus',
 'multus',
 'ille',
 'et',
 'terra',
 'jacto',
 'et',
 'altus',
 'vis',
 'superi',
 'saevus',
 'memoro',
 'Juno',
 'ob',
 'ira',
 'multus',
 'quoque',
 'et',
 'bellum',
 'patior',
 'dum',
 'condo',
 'urbs',
 'infero',
 'deus',
 'Latius',
 'genus',
 'unde']

In [10]:
# First 50 items of the "ner_filtered_tokens" entry for "vergil/aen1".
# NOTE(review): judging by the key name this is the token list with named
# entities removed -- confirm against CorpusAnalytics in vocab_utils.
ap_latin_corpus.analytics["vergil/aen1"]["ner_filtered_tokens"][:50]

['vergil',
 'aeneid',
 'i',
 'p',
 'vergili',
 'maronis',
 'aeneidos',
 'liber',
 'arma',
 'vir',
 'cano',
 'troius',
 'qui',
 'primus',
 'ab',
 'ora',
 'fatum',
 'profugus',
 'laviniaque',
 'venio',
 'litus',
 'multus',
 'ille',
 'et',
 'terra',
 'jacto',
 'et',
 'altus',
 'vis',
 'superi',
 'saevus',
 'memoro',
 'juno',
 'ob',
 'ira',
 'multus',
 'quoque',
 'et',
 'bellum',
 'patior',
 'dum',
 'condo',
 'urbs',
 'infero',
 'deus',
 'latius',
 'genus',
 'unde',
 'pater',
 'atque']

In [11]:
# First 50 items of the "ners" entry for "vergil/aen1".
# NOTE(review): presumably the tokens tagged as named entities -- confirm
# against CorpusAnalytics in vocab_utils.
ap_latin_corpus.analytics["vergil/aen1"]["ners"][:50]


['primvs',
 'italia',
 'latinus',
 'albanus',
 'roma',
 'carthago',
 'italia',
 'tiberinus',
 'samos',
 'libya',
 'romanus',
 'siculus',
 'italia',
 'talia',
 'aeolia',
 'ilium1',
 'italia',
 'simois',
 'talia',
 'eurus',
 'syrtis',
 'abas',
 'aletes',
 'eurum',
 'eure',
 'cymothoe',
 'libya',
 'phrygia',
 'caicus',
 'talia',
 'gyas',
 'libya',
 'italia',
 'certe',
 'romanus',
 'illyricus',
 'liburnus',
 'patavium',
 'olli',
 'lavinium',
 'italia',
 'ilium1',
 'lavinium',
 'hectoreus',
 'mars',
 'ilium1',
 'romulus',
 'martius',
 'romanus',
 'romanus']