# Investigating textual similarities between Homer and Plato's collected works

Part 1 contains the preprocessing and the bigram/trigram comparison.

# Set up

In [None]:
import xml.etree.ElementTree as ET
import stanza
from pathlib import Path
from nltk.tokenize.punkt import PunktLanguageVars
from nltk.util import bigrams
from nltk.util import trigrams
from nltk.tokenize import sent_tokenize
import nltk
import numpy
from collections import Counter
import os
import pandas as pd
import seaborn as sns

In [None]:
# Fetch the 'punkt' resource through NLTK's downloader.
nltk.download('punkt')

# Global Variables

In [None]:
# Homer source files: XML documents that are parsed with ElementTree below.
path_iliad = 'Homer (0012) - Iliad (001).xml'
path_odyssey = 'Homer (0012) - Odyssey (002).xml'
# Folder with the raw Plato texts that are fed to the lemmatizer.
text_folder = Path('Text_Data/')
# Folder the lemmatized Plato texts are written to and later read back from.
lemmatized_folder = Path('Lemmatized_Data/')

# Let's preprocess our data

# For Homer, we extract the already existing Lemmata

In [None]:
# Lemmata that are dropped during preprocessing. Entries are compared against
# lemma forms (the XML "entry" attribute for Homer, the Stanza lemma for Plato).
# NOTE(review): "μή" appears to be listed twice (first and fifth row) — harmless
# for the membership tests used in this file; confirm both are the same code points.
stopwords = [
    "μή", "ἑαυτοῦ", "ἄν", "ἀλλ'", "ἀλλά", "ἄλλος", "ἀπό", "ἄρα", "αὐτός", "δ'", 
    "δέ", "δή", "διά", "δαί", "δαίς", "ἔτι", "ἐγώ", "ἐκ", "ἐμός", "ἐν", 
    "ἐπί", "εἰ", "εἰμί", "εἴμι", "εἰς", "γάρ", "γε", "γα", "ἡ", "ἤ", 
    "καί", "κατά", "μέν", "μετά", "μή", "ὁ", "ὅδε", "ὅς", "ὅστις", "ὅτι", 
    "οὕτως", "οὗτος", "οὔτε", "οὖν", "οὐδείς", "οἱ", "οὐ", "οὐδέ", "οὐκ", "περί", 
    "πρός", "σύ", "σύν", "τά", "τε", "τήν", "τῆς", "τῇ", "τι", "τί", 
    "τις", "τίς", "τό", "τοί", "τοιοῦτος", "τόν", "τούς", "τοῦ", "τῶν", "τῷ", 
    "ὑμός", "ὑπέρ", "ὑπό", "ὡς", "ὦ", "ὥστε", "ἐάν", "παρά", "σός"
]

In [None]:
# Extract the Iliad's lemmata from the annotated XML and store them as a
# single space-separated text file.
tree = ET.parse(path_iliad)
root = tree.getroot()

# Set copy of the stopword list so each per-word membership test is O(1).
stopword_set = set(stopwords)

lemmatized_iliad = []
for sentence in root.findall(".//body/sentence"):
    for child in sentence:
        if child.tag == "word":
            lemma = child.find("lemma")
            # Skip words without a <lemma> child or whose lemma id is "unknown".
            if lemma is None or lemma.get("id") == "unknown":
                continue
            word = lemma.get("entry")
            # A <lemma> without an "entry" attribute yields None, which would
            # make the final " ".join(...) raise a TypeError.
            if word is not None and word not in stopword_set:
                lemmatized_iliad.append(word)
        elif child.tag == "punct":
            mark = child.get("mark")
            # Substring test: the marks , — ᾽ ' are dropped, every other mark
            # is kept as its own token.
            if mark is not None and mark not in ",—᾽'":
                lemmatized_iliad.append(mark)

with open('Iliad_lemmatized.txt', 'w', encoding="utf-8") as f:
    f.write(" ".join(lemmatized_iliad))

In [None]:
# Extract the Odyssey's lemmata from the annotated XML and store them as a
# single space-separated text file.
tree = ET.parse(path_odyssey)
root = tree.getroot()

# Set copy of the stopword list so each per-word membership test is O(1).
stopword_set = set(stopwords)

lemmatized_odyssey = []
for sentence in root.findall(".//body/sentence"):
    for child in sentence:
        if child.tag == "word":
            lemma = child.find("lemma")
            # Skip words without a <lemma> child or whose lemma id is "unknown".
            if lemma is None or lemma.get("id") == "unknown":
                continue
            word = lemma.get("entry")
            # A <lemma> without an "entry" attribute yields None, which would
            # make the final " ".join(...) raise a TypeError.
            if word is not None and word not in stopword_set:
                lemmatized_odyssey.append(word)
        elif child.tag == "punct":
            mark = child.get("mark")
            # Substring test: the marks , — ᾽ ' are dropped, every other mark
            # is kept as its own token.
            if mark is not None and mark not in ",—᾽'":
                lemmatized_odyssey.append(mark)

with open('Odyssey_lemmatized.txt', 'w', encoding="utf-8") as f:
    f.write(" ".join(lemmatized_odyssey))

# For Plato's works, we use a lemmatizer

For the processing of Ancient Greek I rely on the Stanza package as documented on https://stanfordnlp.github.io/stanza/ 

In [None]:
# Stanza pipeline for language code 'grc', limited to the tokenize and lemma processors.
nlp = stanza.Pipeline('grc', processors='tokenize, lemma')

In [None]:
def lemmatize_texts(text):
    """
    Lemmatizes a piece of text with the Stanza pipeline ``nlp``.

    Stopwords and the marks , — ᾽ ' are removed. Every other lemma is kept,
    including sentence punctuation such as '.' and ';'.
    :param text: str, text
    :return: str, the kept lemmata joined by single spaces
    """
    lemmatized_data = nlp(text)
    lemmatized_text = []
    for sentence in lemmatized_data.sentences:
        for word in sentence.words:
            lemma = word.lemma
            # Guard against a word without a lemma: None would raise a
            # TypeError in the substring test below and in " ".join(...).
            if lemma is None:
                continue
            # Substring test against the excluded marks, then the stopword check.
            if lemma in ",—᾽'" or lemma in stopwords:
                continue
            lemmatized_text.append(lemma)
    return " ".join(lemmatized_text)
        

In [None]:
# Lemmatize every raw Plato text and store the result under the same file name
# in lemmatized_folder.
# glob('*') also matches sub-directories (for example checkpoint folders),
# which cannot be opened as text, so only regular files are kept.
texts = sorted(path for path in text_folder.glob('*') if path.is_file())

# Create the output folder up front; otherwise a missing folder only surfaces
# as an error after the first text has already been lemmatized.
lemmatized_folder.mkdir(parents=True, exist_ok=True)

for text_file in texts:
    t = text_file.read_text(encoding="utf-8")
    l_t = lemmatize_texts(t)
    (lemmatized_folder / text_file.name).write_text(l_t, encoding="utf-8")

# Let's start our Comparison

# Let's load our preprocessed data

In [None]:
# Read the two lemmatized Homer texts back in as plain strings.
iliad = Path('Iliad_lemmatized.txt').read_text(encoding="utf-8")
odyssey = Path('Odyssey_lemmatized.txt').read_text(encoding="utf-8")

In [None]:
# Load every lemmatized Plato text into a dict keyed by file name.
# glob('*') also matches sub-directories, which cannot be read as text, so
# only regular files are kept.
lemmatized_texts = sorted(
    path for path in lemmatized_folder.glob('*') if path.is_file()
)

plato_works = {}

for lemmatized_text in lemmatized_texts:
    plato_works[lemmatized_text.name] = lemmatized_text.read_text(encoding="utf-8")

# Bigrams

In [None]:
def compare_bigrams(target, comparison):
    """
    Counts the bigrams two texts have in common.

    The characters , . : ; are deleted from both texts before tokenizing, so
    bigrams are formed across sentence boundaries as well.
    :param target: str, reference text
    :param comparison: str, comparison text
    :return: tuple of the form (common bigrams (Counter object holding, per
        bigram, the smaller of its two counts), number of common bigrams,
        number of common bigrams divided by the number of comparison bigrams;
        0.0 if the comparison text contains no bigram)
    """
    p = PunktLanguageVars()
    # One translate pass deletes all four characters at once.
    strip_punctuation = str.maketrans('', '', ',.:;')
    target_bigrams = Counter(bigrams(p.word_tokenize(target.translate(strip_punctuation))))
    comparison_bigrams = Counter(bigrams(p.word_tokenize(comparison.translate(strip_punctuation))))
    overlap = target_bigrams & comparison_bigrams
    total_count = sum(overlap.values())
    comparison_total = sum(comparison_bigrams.values())
    # A comparison text with fewer than two tokens has no bigrams; report 0.0
    # instead of dividing by zero.
    relative_count = total_count / comparison_total if comparison_total else 0.0
    return overlap, total_count, relative_count


In [None]:
# One row per Plato work: [name, overlap Counter, absolute count, relative count].
# compare_bigrams is evaluated once per work and its 3-tuple unpacked into the
# row, instead of recomputing the same comparison for every column.
iliad_plato_bigram_overlap = [
    [name, *compare_bigrams(iliad, value)]
    for name, value in plato_works.items()
]
data_bi_iliad = pd.DataFrame(iliad_plato_bigram_overlap)
data_bi_iliad

In [None]:
# Bar chart with one bar per Plato work; bar height is the relative bigram
# overlap with the Iliad, scaled to percent.
work_names = data_bi_iliad.iloc[:, 0]
overlap_percent = data_bi_iliad.iloc[:, 3] * 100
plot = sns.barplot(x=work_names, y=overlap_percent)

# Fix one tick per work before re-applying the labels rotated by 90 degrees.
tick_positions = range(len(work_names))
plot.set_xticks(tick_positions)
plot.set_xticklabels(plot.get_xticklabels(), rotation=90, ha="right")

plot.set_title("Percentage of Bigram Overlap between the Iliad and Plato's work")
sns.despine()

In [None]:
# One row per Plato work: [name, overlap Counter, absolute count, relative count].
# compare_bigrams is evaluated once per work and its 3-tuple unpacked into the
# row, instead of recomputing the same comparison for every column.
odyssey_plato_bigram_overlap = [
    [name, *compare_bigrams(odyssey, value)]
    for name, value in plato_works.items()
]
data_bi_odyssey = pd.DataFrame(odyssey_plato_bigram_overlap)
data_bi_odyssey

In [None]:
# Bar chart with one bar per Plato work; bar height is the relative bigram
# overlap with the Odyssey, scaled to percent.
work_names = data_bi_odyssey.iloc[:, 0]
overlap_percent = data_bi_odyssey.iloc[:, 3] * 100
plot = sns.barplot(x=work_names, y=overlap_percent)

# Fix one tick per work before re-applying the labels rotated by 90 degrees.
tick_positions = range(len(work_names))
plot.set_xticks(tick_positions)
plot.set_xticklabels(plot.get_xticklabels(), rotation=90, ha="right")

plot.set_title("Percentage of Bigram Overlap between the Odyssey and Plato's work")
sns.despine()

# Trigrams


In [None]:
def compare_trigrams(target, comparison):
    """
    Counts the trigrams two texts have in common.

    The characters , . : ; are deleted from both texts before tokenizing, so
    trigrams are formed across sentence boundaries as well.
    :param target: str, reference text
    :param comparison: str, comparison text
    :return: tuple of the form (common trigrams (Counter object holding, per
        trigram, the smaller of its two counts), number of common trigrams,
        number of common trigrams divided by the number of comparison
        trigrams; 0.0 if the comparison text contains no trigram)
    """
    p = PunktLanguageVars()
    # One translate pass deletes all four characters at once.
    strip_punctuation = str.maketrans('', '', ',.:;')
    target_trigrams = Counter(trigrams(p.word_tokenize(target.translate(strip_punctuation))))
    comparison_trigrams = Counter(trigrams(p.word_tokenize(comparison.translate(strip_punctuation))))
    overlap = target_trigrams & comparison_trigrams
    total_count = sum(overlap.values())
    comparison_total = sum(comparison_trigrams.values())
    # A comparison text with fewer than three tokens has no trigrams; report
    # 0.0 instead of dividing by zero.
    relative_count = total_count / comparison_total if comparison_total else 0.0
    return overlap, total_count, relative_count

In [None]:
# One row per Plato work: [name, overlap Counter, absolute count, relative count].
# compare_trigrams is evaluated once per work and its 3-tuple unpacked into the
# row, instead of recomputing the same comparison for every column.
iliad_plato_trigram_overlap = [
    [name, *compare_trigrams(iliad, value)]
    for name, value in plato_works.items()
]
data_tri_iliad = pd.DataFrame(iliad_plato_trigram_overlap)
data_tri_iliad

In [None]:
# Bar chart with one bar per Plato work; bar height is the relative trigram
# overlap with the Iliad, scaled to percent.
work_names = data_tri_iliad.iloc[:, 0]
overlap_percent = data_tri_iliad.iloc[:, 3] * 100
plot = sns.barplot(x=work_names, y=overlap_percent)

# Fix one tick per work before re-applying the labels rotated by 90 degrees.
tick_positions = range(len(work_names))
plot.set_xticks(tick_positions)
plot.set_xticklabels(plot.get_xticklabels(), rotation=90, ha="right")

plot.set_title("Percentage of Trigram Overlap between the Iliad and Plato's work")
sns.despine()

In [None]:
# One row per Plato work: [name, overlap Counter, absolute count, relative count].
# compare_trigrams is evaluated once per work and its 3-tuple unpacked into the
# row, instead of recomputing the same comparison for every column.
odyssey_plato_trigram_overlap = [
    [name, *compare_trigrams(odyssey, value)]
    for name, value in plato_works.items()
]
data_tri_odyssey = pd.DataFrame(odyssey_plato_trigram_overlap)
data_tri_odyssey

In [None]:
# Bar chart with one bar per Plato work; bar height is the relative trigram
# overlap with the Odyssey, scaled to percent.
work_names = data_tri_odyssey.iloc[:, 0]
overlap_percent = data_tri_odyssey.iloc[:, 3] * 100
plot = sns.barplot(x=work_names, y=overlap_percent)

# Fix one tick per work before re-applying the labels rotated by 90 degrees.
tick_positions = range(len(work_names))
plot.set_xticks(tick_positions)
plot.set_xticklabels(plot.get_xticklabels(), rotation=90, ha="right")

plot.set_title("Percentage of Trigram Overlap between the Odyssey and Plato's work")
sns.despine()

# Let's look at which sentences (measured by bi/trigrams) are similar

In [None]:
def make_sentences(text):
    """
    Turns a text into a list of sentences. Note that Ancient Greek uses different punctuation!

    A sentence ends at '.' or ';'. The terminators themselves are not part of
    the returned sentences, and words are re-joined with single spaces. Text
    after the last terminator is returned as a final sentence instead of being
    dropped.
    :param text: str
    :return: list of strings (no empty strings)
    """
    sentences = []
    current = []  # words of the sentence currently being collected

    # Detach each terminator from a preceding word so split() isolates it.
    tokens = text.replace('.', ' .').replace(';', ' ;').split()
    for token in tokens:
        if token in ('.', ';'):
            # Consecutive terminators produce no empty sentence.
            if current:
                sentences.append(" ".join(current))
                current = []
        else:
            current.append(token)

    # Flush a trailing sentence that has no closing terminator.
    if current:
        sentences.append(" ".join(current))
    return sentences

In [None]:
# Split the two Homer texts into sentences.
iliad_sentences = make_sentences(iliad)
odyssey_sentences = make_sentences(odyssey)

# For Plato, keep the name of the work next to every sentence: [sentence, work].
plato_sentences = []
for title, lemmatized in plato_works.items():
    plato_sentences.extend(
        [sentence, title] for sentence in make_sentences(lemmatized)
    )

In [None]:
def make_bg_list(sentence):
    """
    Tokenizes a sentence and collects its bigrams.
    :param sentence: str, a single sentence
    :return: list of the bigrams built from the sentence's tokens
    """
    tokenizer = PunktLanguageVars()
    tokens = tokenizer.word_tokenize(sentence)
    return list(bigrams(tokens))

In [None]:
# Pair every Homer sentence with its bigram list: [bigrams, sentence].
iliad_sentences_with_bigram = [[make_bg_list(s), s] for s in iliad_sentences]

odyssey_sentences_with_bigram = [[make_bg_list(s), s] for s in odyssey_sentences]

# Plato rows are [bigrams, sentence, work]; sentences without any bigram are
# skipped. The bigram list is computed once per sentence and reused for both
# the emptiness check and the stored row.
plato_sentences_with_bigram = []
for sentence, work in plato_sentences:
    sentence_bigrams = make_bg_list(sentence)
    if sentence_bigrams:
        plato_sentences_with_bigram.append([sentence_bigrams, sentence, work])

# Comparison Iliad

In [None]:
# Distinct bigrams per sentence; a bigram repeated within a sentence counts once.
iliad_bigram_sets = [set(entry[0]) for entry in iliad_sentences_with_bigram]
plato_bigram_sets = [set(entry[0]) for entry in plato_sentences_with_bigram]

# ScoreMatrix[i, j] holds the number of distinct bigrams shared by Iliad
# sentence i and Plato sentence j.
matrix_shape = (len(iliad_sentences_with_bigram), len(plato_sentences_with_bigram))
ScoreMatrix = numpy.zeros(matrix_shape)

for row, homer_set in enumerate(iliad_bigram_sets):
    for col, plato_set in enumerate(plato_bigram_sets):
        ScoreMatrix[row, col] = len(homer_set.intersection(plato_set))

In [None]:
# Rank the Iliad sentences by their summed overlap with all Plato sentences
# and keep the ten highest, best first.
row_sums = ScoreMatrix.sum(axis=1)
top_10_indices = numpy.argsort(row_sums)[::-1][:10]
top_10_rows = ScoreMatrix[top_10_indices]
# For each of those rows, the five Plato columns with the highest score, best first.
top_5_indices_per_row = numpy.argsort(top_10_rows, axis=1)[:, ::-1][:, :5]

# Output
print("Indices of the top 5 values in each of the top 10 rows:")
for homer_index, plato_indices in zip(top_10_indices, top_5_indices_per_row):
    print('\nSentence in the Iliad')
    print(iliad_sentences_with_bigram[homer_index][1])
    print("\nSimilar Sentences in Plato's works")
    for plato_index in plato_indices:
        # Drop the bigram list, print [sentence, work].
        print(plato_sentences_with_bigram[plato_index][1:])


# Comparison Odyssey

In [None]:
# Distinct bigrams per sentence; a bigram repeated within a sentence counts once.
odyssey_bigram_sets = [set(entry[0]) for entry in odyssey_sentences_with_bigram]
plato_bigram_sets = [set(entry[0]) for entry in plato_sentences_with_bigram]

# ScoreMatrix[i, j] holds the number of distinct bigrams shared by Odyssey
# sentence i and Plato sentence j.
matrix_shape = (len(odyssey_sentences_with_bigram), len(plato_sentences_with_bigram))
ScoreMatrix = numpy.zeros(matrix_shape)

for row, homer_set in enumerate(odyssey_bigram_sets):
    for col, plato_set in enumerate(plato_bigram_sets):
        ScoreMatrix[row, col] = len(homer_set.intersection(plato_set))

In [None]:
# Rank the Odyssey sentences by their summed overlap with all Plato sentences
# and keep the ten highest, best first.
row_sums = numpy.sum(ScoreMatrix, axis=1)
top_10_indices = numpy.argsort(row_sums)[-10:][::-1]
top_10_rows = ScoreMatrix[top_10_indices]
# For each of those rows, the five Plato columns with the highest score, best first.
top_5_indices_per_row = numpy.argsort(top_10_rows, axis=1)[:, -5:][:, ::-1]

# Output
print("Indices of the top 5 values in each of the top 10 rows:")
for i, sentence_index in enumerate(top_10_indices):
    print('\nSentence in the Odyssey')
    # The row indices refer to the Odyssey score matrix, so the sentence must
    # come from the Odyssey list (the Iliad list was used here by mistake).
    print(odyssey_sentences_with_bigram[sentence_index][1])
    print("\nSimilar Sentences in Plato's works")
    for j in top_5_indices_per_row[i, :]:
        # Drop the bigram list, print [sentence, work].
        print(plato_sentences_with_bigram[j][1:])