# Query PTA

In [2]:
import re
import requests
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML

## PTA ad plaintext data

(This was generated with another script and exported to a CSV file.)

In [None]:
# Load the exported plaintext table; later cells read its 'urn' and 'text' columns.
df = pd.read_csv('data/severian_plaintext.csv')


In [None]:
# Bare expression: the notebook renders the loaded table as the cell output.
df

In [5]:
# Keep only the 'text' values of the rows whose 'urn' equals this identifier.
select = df.loc[df['urn'] == 'pta0001.pta017.pta-grc1', 'text']

In [6]:
# Concatenate the selected text values into one string.
# NOTE(review): no separator is inserted between rows -- if several rows match,
# confirm that the row texts already end with whitespace or punctuation.
selected_data = "".join(select.values)

## Dictionary queries with the [Classical Language Dictionary (CLD)](https://cld.bbaw.de)

In [101]:
def dictionary_wrapper(lemma, lang, timeout=30):
    """Look up a lemma in the Classical Language Dictionary (CLD).

    Sends a GET request to the CLD lemma endpoint with the ``regex`` option
    enabled and returns the decoded JSON answer.

    Args:
        lemma: Lemma (or pattern) to look up; it becomes the last segment of
            the request path.
        lang: Value of the ``language`` query parameter, e.g. ``"grc"``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    from urllib.parse import quote

    # Percent-encode the lemma so that characters such as "/", "?" or "#"
    # (all plausible in a regex) stay inside the path segment instead of
    # being interpreted as URL structure.
    url = "https://cld.bbaw.de/api/dictionary/lemma/" + quote(lemma, safe="")
    params = {'language': lang, 'options': 'regex'}

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()

In [102]:
# Example lookup: the lemma "ἀδελφός", with "grc" passed as the language parameter.
dictentry= dictionary_wrapper("ἀδελφός", "grc")

In [None]:
# Inspect the raw "data" part of the response.
print(dictentry["data"])

In [None]:
# Walk through the descriptions attached to the first entry of the result and
# show, for each one, the dictionary name followed by its description.
first_match = dictentry["data"][0]
for description in first_match["descriptions"]:
    print(description["dictionary"])
    # Wrapped in HTML() so the notebook renders the description as markup.
    display(HTML(description["description"]))

## Analyse a text with the help of the [Classical Language Dictionary (CLD)](https://cld.bbaw.de)

Disclaimer: The CLD currently allows a maximum of 1500 characters per query, so we need some workarounds. The CLD also currently uses outdated versions of the spaCy models, so the analysis contains more errors than it would with up-to-date models. In addition, the CLD does not currently give access to all of the information provided by the spaCy model.


In [7]:
def slice_text(text, max_length=1000):
    """Split *text* into consecutive chunks of at most *max_length* characters.

    Each chunk ends, where possible, directly after the last punctuation mark
    ('·', ';' or '.') found inside the current window, so that chunks break at
    clause or sentence boundaries. If the window contains none of these marks
    the text is cut hard at exactly *max_length* characters. Joining the
    returned chunks with ``"".join`` reproduces *text*.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per chunk (at least 1).

    Returns:
        A list of chunks in original order; empty if *text* is empty.

    Raises:
        ValueError: If *max_length* is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    slices = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + max_length

        # The remainder fits into a single chunk.
        if end >= text_length:
            slices.append(text[start:])
            break

        # Index of the last punctuation mark inside text[start:end],
        # or -1 if the window contains none.
        last_punctuation = max(text.rfind(p, start, end) for p in '·;.')

        if last_punctuation == -1:
            # No punctuation: cut at exactly max_length characters. (Cutting
            # at end + 1 would produce a chunk one character too long.)
            cut = end
        else:
            # Keep the punctuation mark with the chunk it terminates.
            cut = last_punctuation + 1

        slices.append(text[start:cut])
        start = cut

    return slices

In [8]:
def analyse_wrapper(text, timeout=60):
    """Send *text* to the CLD analyze endpoint and return the decoded answer.

    The request asks for the ``grc_proiel_lg`` model and list output. All
    available parameters are documented at
    https://cld.bbaw.de/api/analyze/parameters.

    Args:
        text: Text to analyse. The service accepts at most 1500 characters
            per request, so longer texts must be split beforehand.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = "https://cld.bbaw.de/api/analyze"
    # NOTE(review): 'dep_' appears twice in the attribute list -- presumably
    # harmless, but confirm against the API before removing the duplicate.
    params = {
        'text': text,
        'model': 'grc_proiel_lg',
        'attributes': 'lemma_|pos_|tag_|ent_type_|dep_|morph|dep_|head|is_stop|is_punct|is_sent_start|is_sent_end|vector',
        'output': 'list',
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.post(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()

In [9]:
# Split the selected text into chunks, using slice_text's default max_length.
slices = slice_text(selected_data)

In [10]:
# Analyse every slice on its own and gather the "data" items of all
# responses, in order, into one flat list.
doc = [
    item
    for chunk in slices
    for item in analyse_wrapper(chunk)["data"]
]


In [144]:
# Tabular view of the analysis: one row per item in `doc` (presumably one per token).
analyzed = pd.DataFrame(doc)


In [None]:
# Uncomment the options below to lift pandas' display limits and see the
# complete table instead of the truncated default view.
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)
#pd.set_option('display.width', None)
display(analyzed)

### Simple questions answered using the analysis

#### What are the most used words in the text?

without stop words

In [None]:
# Lemmas of all tokens that are neither stop words nor punctuation.
words = [
    token["lemma"]
    for token in doc
    if not (token["is_stop"] or token["is_punct"])
]

# How often each lemma occurs.
word_counts = Counter(words)

# Build the cloud from the lemma frequencies. A font file is passed
# explicitly -- presumably so that the Greek glyphs are rendered.
lemma_frequencies = dict(word_counts)
wordcloud = WordCloud(
    width=800,
    height=400,
    font_path='data/DejaVuSans.ttf',
).generate_from_frequencies(lemma_frequencies)

# Show the cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


In [None]:
# Take the 20 most frequent entries of word_counts and split the
# (word, count) pairs into two parallel tuples for plotting.
most_frequent_words = word_counts.most_common(20)
words, frequencies = zip(*most_frequent_words)

# Plot the most frequent words
plt.figure(figsize=(10, 5))
plt.bar(words, frequencies)
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.title('20 most Frequent Words')
# Rotate the tick labels so the words do not overlap.
plt.xticks(rotation=90)

# Add the number of mentions to each bar
# (the label is placed one unit above the top of the bar, centred on it).
for i, freq in enumerate(frequencies):
    plt.text(i, freq + 1, str(freq), ha='center')

plt.tight_layout()
plt.show()

#### Are there named entities in the text?

In [None]:
# Print every token that carries a non-empty entity type, followed by that type.
for token in doc:
    entity_type = token["ent_type"]
    if not entity_type:
        continue
    print(token["text"], entity_type)

#### Which are the 10 most used verbs, nouns and adjectives? 

In [None]:
# Collect the lemmas of all non-stop-word tokens, grouped by part of speech.
nouns, adjectives, verbs = [], [], []
lemmas_by_pos = {"NOUN": nouns, "ADJ": adjectives, "VERB": verbs}

for token in doc:
    if token["is_stop"]:
        continue
    bucket = lemmas_by_pos.get(token["pos"])
    if bucket is not None:
        bucket.append(token["lemma"])

# Ten most frequent lemmas per group: verbs first, then nouns, then adjectives.
for lemmas in (verbs, nouns, adjectives):
    print(Counter(lemmas).most_common(10))

#### What combinations of substantives and adjectives are in the text?

For the abbreviations used for the dependency relations, cf. https://downloads.cs.stanford.edu/nlp/software/dependencies_manual.pdf

In [None]:
# Select the tokens whose dependency label is "amod" and pair each one with
# the value of its "head" field (presumably the text of the head token).
pairs = [
    token["text"] + " " + token["head"]
    for token in doc
    if not token["is_punct"] and token["dep"] == "amod"
]

# List every combination with its number of occurrences, most frequent first.
counter = Counter(pairs)
for element, count in counter.most_common():
    print(f"{element:<25}: {count}x")

#### What does the text say about Jews and heretics?

We want to get all instances when Jew or heretic is in the nominative; we also want to get the verb that is used to describe their action.

In [None]:
# Jews and Haeretics
for token in doc:
    if token["lemma"] == "Ἰουδαῖος" and "Case=Nom" in token["morph"]: #and token.dep_ == "nsubj":
        print(token["text"], token["head"])
    if token["lemma"] == "αἱρετικός" and "Case=Nom" in token["morph"]:
        print(token["text"], token["head"])