In [17]:
from collections import Counter
from lxml import etree
from MyCapytain.common.constants import Mimetypes
from MyCapytain.resources.texts.local.capitains.cts import CapitainsCtsText
import pandas as pd

with open("../tei/tlg0525.tlg001.perseus-grc2.xml") as f:
    text = CapitainsCtsText(urn="urn:cts:greekLit:tlg0525.tlg001.perseus-grc2", resource=f)

urns = []
raw_xmls = []
unannotated_strings = []

for ref in text.getReffs(level=len(text.citation)):
    urn = f"{text.urn}:{ref}"
    node = text.getTextualNode(ref)
    raw_xml = node.export(Mimetypes.XML.TEI)
    tree = node.export(Mimetypes.PYTHON.ETREE)
    s = etree.tostring(tree, encoding="unicode", method="text")

    urns.append(urn)
    raw_xmls.append(raw_xml)
    unannotated_strings.append(s)

d = {
    "urn": pd.Series(urns, dtype="string"),
    "raw_xml": raw_xmls,
    "unannotated_strings": pd.Series(unannotated_strings, dtype="string")
}
pausanias_df = pd.DataFrame(d)
pausanias_df['whitespaced_tokens'] = pausanias_df['unannotated_strings'].str.split()
pausanias_4 = pausanias_df.loc[pausanias_df["urn"].str.contains('.perseus-grc2:4', regex=False)]
pausanias_5 = pausanias_df.loc[pausanias_df["urn"].str.contains('.perseus-grc2:5', regex=False)]
pausanias_6 = pausanias_df.loc[pausanias_df["urn"].str.contains('.perseus-grc2:6', regex=False)]
subset = [pausanias_4, pausanias_5, pausanias_6]

# TOP N TYPES! 
for p in subset:
    please_work = p['whitespaced_tokens']
    type_counts = Counter(please_work.explode())
    print(type_counts.most_common(100))



[('καὶ', 1246), ('δὲ', 909), ('ἐς', 397), ('τε', 371), ('τὴν', 325), ('τὸ', 303), ('τῶν', 290), ('μὲν', 283), ('ἐν', 266), ('τοῦ', 257), ('οἱ', 250), ('τὰ', 238), ('τῆς', 223), ('τὸν', 204), ('ὁ', 188), ('τοὺς', 171), ('ὡς', 166), ('τοῖς', 160), ('γὰρ', 138), ('τῇ', 133), ('τῷ', 133), ('ἐπὶ', 126), ('οὐ', 102), ('ἐκ', 100), ('ὑπὸ', 85), ('δὴ', 82), ('ἡ', 80), ('Μεσσηνίων', 72), ('ἀπὸ', 71), ('πρὸς', 69), ('οὐκ', 66), ('ἢ', 65), ('παρὰ', 64), ('κατὰ', 61), ('μάλιστα', 60), ('τὰς', 58), ('Μεσσηνίοις', 55), ('ἦν', 55), ('τότε', 53), ('ἤδη', 52), ('Λακεδαιμονίων', 52), ('ἔτι', 48), ('σφισιν', 46), ('Μεσσήνιοι', 45), ('περὶ', 45), ('οὖν', 45), ('ἐστιν', 41), ('Μεσσηνίους', 40), ('εἶναι', 39), ('μετὰ', 38), ('ἐξ', 37), ('αὐτὸν', 37), ('Λακεδαιμόνιοι', 37), ('τοῦτο', 36), ('γε', 36), ('δέ', 34), ('ὕστερον', 34), ('Λακεδαιμονίοις', 34), ('διὰ', 33), ('ἄλλα', 33), ('ταῦτα', 32), ('αὐτῶν', 32), ('Λακεδαιμονίους', 30), ('τι', 30), ('Ἀριστομένης', 30), ('μὴ', 29), ('σφᾶς', 29), ('ἐνταῦθα', 29), ('

Right now, this is just telling me that Pausanias is using a lot of particles and definite articles. There could be an argument here that these are the 100 most common tokens for each book. There's also some preposition representation here. Let's filter out stopwords and see what changes...

In [18]:
import spacy

nlp = spacy.load("grc_proiel_sm", disable=["ner"])
tokenizer = nlp.tokenizer

for s in subset:
    s['tokens'] = s['unannotated_strings'].apply(tokenizer)
    types = [t.text for t in s['tokens'].explode() if not t.is_stop and t.is_alpha]
    type_counts = Counter(types)
    print(type_counts.most_common(100))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s['tokens'] = s['unannotated_strings'].apply(tokenizer)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s['tokens'] = s['unannotated_strings'].apply(tokenizer)


[('Μεσσηνίων', 82), ('μάλιστα', 62), ('τότε', 58), ('Μεσσηνίοις', 57), ('Λακεδαιμονίων', 53), ('ἤδη', 52), ('σφισιν', 48), ('Μεσσηνίους', 47), ('Μεσσήνιοι', 47), ('Λακεδαιμόνιοι', 40), ('ὕστερον', 37), ('Λακεδαιμονίοις', 37), ('γενέσθαι', 36), ('ἐνταῦθα', 34), ('Ἀριστομένης', 32), ('Λακεδαιμονίους', 31), ('σφᾶς', 31), ('σφίσιν', 30), ('πόλιν', 29), ('ὄνομα', 26), ('ἅτε', 26), ('Μεσσήνην', 25), ('σφισι', 25), ('πρότερον', 23), ('δʼ', 23), ('ὕδωρ', 23), ('θεῶν', 22), ('πάντα', 22), ('ἅμα', 21), ('μάχης', 20), ('πολέμου', 20), ('Ἀριστομένους', 20), ('ἐγένετο', 19), ('ὅσοι', 19), ('Ἀριστοδήμου', 19), ('Ἀρκάδων', 19), ('Ἀριστομένην', 18), ('Μεσσήνης', 17), ('λέγουσι', 17), ('ἐπεὶ', 17), ('αὖθις', 17), ('ἱερὸν', 17), ('σφίσι', 17), ('χώραν', 16), ('Σπάρτην', 16), ('πολὺ', 16), ('πόλεμον', 16), ('Ἰθώμην', 16), ('ἄγαλμα', 16), ('λέγουσιν', 15), ('Ἑλλήνων', 15), ('ἔνθα', 15), ('παῖδα', 15), ('βασιλέα', 15), ('κατʼ', 15), ('ἀπʼ', 14), ('διʼ', 14), ('ἄνδρα', 14), ('πρὸ', 14), ('παρʼ', 14), ('Μεσσ

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s['tokens'] = s['unannotated_strings'].apply(tokenizer)


Which leaves us with lemmata. I'll go more in depth once I have my three datasets.

In [22]:
for s in subset:
    raw_texts = [t for t in s['unannotated_strings']]
    annotated_texts = nlp.pipe(raw_texts, batch_size=100)
    s['nlp_docs'] = list(annotated_texts)
    lemmata = [t.lemma_ for t in s['nlp_docs'].explode() if not t.is_stop and t.is_alpha]
    lemmata_counts = Counter(lemmata)
    print(lemmata_counts.most_common(100))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s['nlp_docs'] = list(annotated_texts)


[('Μεσσήνιος', 277), ('Λακεδαιμόνιος', 162), ('σφεῖς', 158), ('γίγνομαι', 131), ('ἔχω', 126), ('λέγω', 95), ('πολύς', 93), ('Ἀριστομένης', 82), ('ποιέω', 78), ('ἀνήρ', 73), ('πόλις', 73), ('πᾶς', 67), ('μάλα', 66), ('θεός', 63), ('τότε', 58), ('παῖς', 57), ('Μεσσήνη', 56), ('ἤδη', 52), ('φημί', 51), ('καλέω', 50), ('πόλεμος', 47), ('μάχη', 45), ('πρῶτος', 42), ('λόγος', 42), ('ἐκεῖνος', 41), ('ὕστερος', 41), ('ὅσος', 41), ('βασιλεύς', 40), ('Ἀρκάς', 40), ('ἄγω', 39), ('γυνή', 37), ('Ἀριστόδημος', 37), ('ἀφικνέομαι', 36), ('ἐνταῦθα', 36), ('ἀρχή', 34), ('Ἰθώμη', 34), ('μέγας', 34), ('δίδωμι', 34), ('μέλλω', 34), ('θυγάτηρ', 31), ('πρότερος', 30), ('ἔτος', 30), ('βασιλεύω', 29), ('δοκέω', 29), ('ὕδωρ', 29), ('χράω', 29), ('Σπάρτη', 29), ('ὄνομα', 28), ('γῆ', 27), ('χώρα', 26), ('ἀποθνῄσκω', 26), ('λαμβάνω', 26), ('Ἕλλην', 26), ('ἔργον', 26), ('χρόνος', 26), ('ὁράω', 26), ('ἱερόν', 26), ('ἅτε', 26), ('αἱρέω', 26), ('πάρειμι', 26), ('ἄγαλμα', 26), ('ἐθέλω', 25), ('οἶδα', 25), ('νομίζω', 25

# two
For the findings, I am choosing to focus mostly on lemmata as seen for books 4-6. That said, not only is Messenian one of the most common lemmata for book 4, but a fair number of the most prevalent types seem to be types of this lemma, and the same can be said for Spartan. In both of the other books, the most common lemma is  Ἠλεῖος. In book 5 ἄγαλμα is used 70 times and then that drops to 28 in the next book. ἄγαλμα is also used 26 times in book 4. The related word εἰκών appears 82 times, and only in book 6. In book 6 the lemma  νίκη appears 111 times and the lemma νικάω appears 41 times. The lemma σφεῖς appears in book 4 158 times, then 74 times in book 5, and then 50 times in book 6. While books 5 and 6 use 3 types of this lemma, book 4 uses 5.
            
# three
            
Based on my classics background and the results of these three counts across books 4-6, it seems that in this swath Pausanias is either 1) using book 4 to discuss Messenian and Spartan relations or 2) using book 4 to finish discussing those relations, and then the next two books have to do with Olympia, and I'd hazard a guess based on the prevalence of lemmas like νίκη and νικάω in book 6 that that's when he turns from general discussion of Olympia to discussing the games the city is famous for. I also think it's really interesting how in book 4, ἄγαλμα is used 26 times, and then in book 5 ἄγαλμα is used 70 times before dropping to 28 in the next book. The word itself can mean glory, honor, delight, or a cult statue, and I also see mention of Herakles in book 5. One could assume that Pausanias starts discussing ἄγαλμα in Olympia as it relates in both senses to Zeus and the related hero-cult of Herakles, but I don't think it totally goes away in the context of the Olympics, and you see εἰκών (likeness) in Book 6 as well, which may exist in other books, but is not in the top 100 of any other count. This leads me to believe they're using  ἄγαλμα more in the second word sense of a cult statue, but I would need to take a closer read to confirm that. 
            
Something else I thought was interesting was the drop in use of the pronoun σφεῖς. Thucydides uses it a good amount, but there's using it a good amount and then there's using it 158 times in book 4 and then using it less than half as frequently in the next two books. Not only that, but in addition to the σφᾶς and σφισιν seen in books 5 and 6, σφισι is occasionally present in book 4. To me, you use personal pronouns when you're describing people, and if you commonly do this in the plural sense, you're describing groups. The varied and increased presence of σφεῖς in book 4, along with the prevalence of Messenian and Spartan types and lemmata, set up a somewhat convincing case for the argument that the fraught history of Messenia and Sparta (and hopefully its happy ending?) are discussed in no small part in book 4.