# Investigating textual similarities between Homer and Plato's collected works

Part 2 contains other similarity measures.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from datasketch import MinHash
import unicodedata
from gensim.models import FastText
import numpy as np

# Util Functions

In [None]:
def make_sentences_str(text):
    """
    Turns a text into a list of sentences, each one a single string.

    Sentences end at '.' or ';' (in Ancient Greek ';' is the question mark).
    The terminators themselves are not part of the output, and the words of a
    sentence are joined with exactly one space. Text after the last terminator
    is returned as a final sentence instead of being discarded.

    :param text: str
    :return: list of strings
    """
    sentences = []
    words = []

    # Detach terminators from the preceding word so split() yields them as
    # stand-alone tokens.
    text = text.replace('.', ' .').replace(';', ' ;')
    for token in text.split():
        if token in ('.', ';'):
            # Runs of terminators ("...") must not create empty sentences.
            if words:
                sentences.append(' '.join(words))
                words = []
        else:
            words.append(token)

    # Flush trailing words that were not closed by a terminator.
    if words:
        sentences.append(' '.join(words))
    return sentences

In [None]:
def make_sentences_list(text):
    """
    Turns a text into a list of sentences, each one a list of words.

    Sentences end at '.' or ';' (in Ancient Greek ';' is the question mark).
    The terminators themselves are not part of the output. Every word of a
    sentence is kept, and text after the last terminator is returned as a
    final sentence instead of being discarded.

    :param text: str
    :return: list of lists
    """
    sentences = []
    words = []

    # Detach terminators from the preceding word so split() yields them as
    # stand-alone tokens.
    text = text.replace('.', ' .').replace(';', ' ;')
    for token in text.split():
        if token in ('.', ';'):
            # Runs of terminators ("...") must not create empty sentences.
            if words:
                # Append the full word list; slicing off the last element
                # would lose a real word here (there is no trailing space to
                # strip as in the string variant).
                sentences.append(words)
                words = []
        else:
            words.append(token)

    # Flush trailing words that were not closed by a terminator.
    if words:
        sentences.append(words)
    return sentences

In [None]:
def get_sentence_pairs_above_threshold(df, sentences1, sentences2, threshold):
    """
    Collect every sentence pair whose similarity is strictly above a threshold.

    The row labels of ``df`` are used to index ``sentences1`` and the column
    labels to index ``sentences2``. Pairs are returned in row-major order.

    :param df: DataFrame of similarity scores
    :param sentences1: sentences addressed by the row labels of df
    :param sentences2: sentences addressed by the column labels of df
    :param threshold: scores must be greater than this value to be kept
    :return: list of dicts with keys 'sentence1', 'sentence2', 'similarity'
    """
    return [
        {
            "sentence1": sentences1[row_label],
            "sentence2": sentences2[col_label],
            "similarity": score,
        }
        for row_label, scores in df.iterrows()
        for col_label, score in scores.items()
        if score > threshold
    ]

# Comparison TF/IDF

In [None]:
def calculate_tfidf_similarity(text1, text2):
    """
    Compare every sentence of text1 with every sentence of text2 using TF-IDF.

    Both texts are split with make_sentences_str. A single TfidfVectorizer is
    fitted on the sentences of both texts together, so they share one
    vocabulary and one set of IDF weights. The result holds the cosine
    similarity of each sentence pair.

    If either text yields no sentences, an empty DataFrame is returned (the
    same convention as calculate_minhash_similarity) instead of passing empty
    input on to scikit-learn.

    :param text1: str
    :param text2: str
    :return: tuple (df, sentences1, sentences2); df has one row per sentence
             of text1 and one column per sentence of text2
    """
    sentences1 = make_sentences_str(text1)
    sentences2 = make_sentences_str(text2)

    # Nothing to compare: return early with an empty result.
    if not sentences1 or not sentences2:
        return pd.DataFrame(), sentences1, sentences2

    all_sentences = sentences1 + sentences2

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_sentences)

    # Rows come back in input order, so the first len(sentences1) rows belong
    # to text1 and the remainder to text2.
    tfidf1 = tfidf_matrix[:len(sentences1)]
    tfidf2 = tfidf_matrix[len(sentences1):]

    similarity_matrix = cosine_similarity(tfidf1, tfidf2)

    df = pd.DataFrame(
        similarity_matrix,
        index=range(len(sentences1)),
        columns=range(len(sentences2))
    )

    return df, sentences1, sentences2

# Comparison using FastText

In [None]:
def calculate_fasttext_similarity(text1, text2):
    """
    Compare every sentence of text1 with every sentence of text2 using FastText.

    Both texts are split with make_sentences_list. Each sentence is embedded
    as the mean of the vectors of its words that the model knows; a sentence
    with no known words is embedded as a zero vector. The result holds the
    cosine similarity of each pair of sentence embeddings.

    The model is loaded from 'fasttext_model.bin' on every call.

    If either text yields no sentences, an empty DataFrame is returned (the
    same convention as calculate_minhash_similarity) before the model is
    loaded, instead of passing empty input on to scikit-learn.

    :param text1: str
    :param text2: str
    :return: tuple (df, sentences1, sentences2); df has one row per sentence
             of text1 and one column per sentence of text2
    """
    sentences1 = make_sentences_list(text1)
    sentences2 = make_sentences_list(text2)

    # Nothing to compare: return early and skip the model load.
    if not sentences1 or not sentences2:
        return pd.DataFrame(), sentences1, sentences2

    model = FastText.load('fasttext_model.bin')
    keyed_vectors = model.wv
    vector_size = model.vector_size

    def get_sentence_embedding(sentence):
        """Return the mean word vector of a sentence, or zeros if none is known."""
        if not sentence:  # Empty sentence
            return np.zeros(vector_size)

        vectors = []
        for word in sentence:
            # Look words up in NFC form.
            # NOTE(review): assumes the model vocabulary is NFC-normalized; confirm.
            normalized_word = unicodedata.normalize("NFC", word)
            if normalized_word in keyed_vectors:
                vectors.append(keyed_vectors[normalized_word])

        if vectors:
            return sum(vectors) / len(vectors)
        else:
            return np.zeros(vector_size)

    embeddings1 = [get_sentence_embedding(sentence) for sentence in sentences1]
    embeddings2 = [get_sentence_embedding(sentence) for sentence in sentences2]

    similarity_matrix = cosine_similarity(embeddings1, embeddings2)

    df = pd.DataFrame(
        similarity_matrix,
        index=range(len(sentences1)),
        columns=range(len(sentences2))
    )

    return df, sentences1, sentences2

# Comparison using MinHash

In [None]:
def calculate_minhash_similarity(text1, text2):
    """
    Compare every sentence of text1 with every sentence of text2 using MinHash.

    Both texts are split with make_sentences_list and empty sentences are
    discarded. Each remaining sentence gets a MinHash signature built from its
    UTF-8 encoded words, and the result holds the estimated Jaccard similarity
    of each pair of signatures.

    If either text has no non-empty sentence, an empty DataFrame is returned.

    :param text1: str
    :param text2: str
    :return: tuple (df, sentences1, sentences2); df has one row per kept
             sentence of text1 and one column per kept sentence of text2
    """
    sentences1 = list(filter(None, make_sentences_list(text1)))
    sentences2 = list(filter(None, make_sentences_list(text2)))

    if not (sentences1 and sentences2):
        return pd.DataFrame(), sentences1, sentences2

    def build_signature(tokens):
        """Feed every token of one sentence into a fresh MinHash."""
        signature = MinHash()
        for token in tokens:
            signature.update(token.encode('utf8'))
        return signature

    signatures1 = list(map(build_signature, sentences1))
    signatures2 = list(map(build_signature, sentences2))

    # One row per sentence of text1, one column per sentence of text2.
    scores = []
    for left in signatures1:
        scores.append([left.jaccard(right) for right in signatures2])

    result = pd.DataFrame(
        scores,
        index=range(len(sentences1)),
        columns=range(len(sentences2)),
    )
    return result, sentences1, sentences2

# Combining Data and Functions

# Data

In [None]:
# Read both Homeric texts fully into memory as single strings.
# The files are expected in the current working directory; judging by the
# file names they hold the lemmatized text.
with open('Iliad_lemmatized.txt', 'r', encoding="utf-8") as f:
    iliad = f.read()
with open('Odyssey_lemmatized.txt', 'r', encoding="utf-8") as f:
    odyssey = f.read()

In [None]:
from nltk.corpus import CategorizedPlaintextCorpusReader
# Corpus of comparison texts: every .txt file below 'Lemmatized_Data'.
# NOTE(review): presumably Plato's works, going by the notebook title; confirm.
path = 'Lemmatized_Data'
# cat_pattern derives each file's category from its name: the captured group
# is the part of the file name before the first underscore.
mycorpus = CategorizedPlaintextCorpusReader(path, r'.*\.txt', cat_pattern=r'(.*?)_.*')

# Iliad

In [None]:
# Iliad vs. every corpus file, TF-IDF cosine similarity, threshold 0.5.
columns = ['text1', 'text2', 'sentence1','sentence2','similarity']
all_rows = []
for f in mycorpus.fileids():
    print(f"Processing {f}...")

    # Rebuild the file as one space-separated string from the reader's tokens.
    text2 = ' '.join(mycorpus.words(fileids=f))
    tfidf_df, sentences1, sentences2 = calculate_tfidf_similarity(iliad, text2)
    pairs = get_sentence_pairs_above_threshold(tfidf_df, sentences1, sentences2, 0.5)

    # One output row per matching sentence pair.
    all_rows.extend(
        ['Iliad', f, pair['sentence1'], pair['sentence2'], pair['similarity']]
        for pair in pairs
    )

df_tfidf = pd.DataFrame(all_rows, columns=columns)


In [None]:
# Display the TF-IDF matches as the cell output.
df_tfidf

In [None]:
# Iliad vs. every corpus file, FastText sentence embeddings, threshold 0.92.
columns = ['text1', 'text2', 'sentence1','sentence2','similarity']
all_rows = []
for f in mycorpus.fileids():
    print(f"Processing {f}...")

    # Rebuild the file as one space-separated string from the reader's tokens.
    text2 = ' '.join(mycorpus.words(fileids=f))
    fasttext_df, sentences1, sentences2 = calculate_fasttext_similarity(iliad, text2)
    pairs = get_sentence_pairs_above_threshold(fasttext_df, sentences1, sentences2, 0.92)

    # One output row per matching sentence pair.
    all_rows.extend(
        ['Iliad', f, pair['sentence1'], pair['sentence2'], pair['similarity']]
        for pair in pairs
    )

df_fasttext = pd.DataFrame(all_rows, columns=columns)

In [None]:
# Display the FastText matches as the cell output.
df_fasttext

In [None]:
# Iliad vs. every corpus file, MinHash Jaccard estimate, threshold 0.50.
columns = ['text1', 'text2', 'sentence1','sentence2','similarity']
all_rows = []
for f in mycorpus.fileids():
    print(f"Processing {f}...")

    # Rebuild the file as one space-separated string from the reader's tokens.
    text2 = ' '.join(mycorpus.words(fileids=f))
    minhash_df, sentences1, sentences2 = calculate_minhash_similarity(iliad, text2)
    pairs = get_sentence_pairs_above_threshold(minhash_df, sentences1, sentences2, 0.50)

    # One output row per matching sentence pair.
    all_rows.extend(
        ['Iliad', f, pair['sentence1'], pair['sentence2'], pair['similarity']]
        for pair in pairs
    )

df_minhash = pd.DataFrame(all_rows, columns=columns)

In [None]:
# Display the MinHash matches as the cell output.
df_minhash

# Odyssey

In [None]:
# Odyssey vs. every corpus file, TF-IDF cosine similarity, threshold 0.5.
# This cell belongs to the Odyssey section, so it compares the `odyssey` text
# and labels the rows 'Odyssey' (not the Iliad).
# NOTE: df_tfidf is rebound here, replacing any earlier result of that name.
columns = ['text1', 'text2', 'sentence1', 'sentence2', 'similarity']
all_rows = []
for f in mycorpus.fileids():
    print(f"Processing {f}...")

    # Rebuild the file as one space-separated string from the reader's tokens.
    text2 = ' '.join(mycorpus.words(fileids=f))
    tfidf_df, sentences1, sentences2 = calculate_tfidf_similarity(odyssey, text2)
    pairs = get_sentence_pairs_above_threshold(tfidf_df, sentences1, sentences2, 0.5)

    for pair in pairs:
        all_rows.append(['Odyssey', f, pair['sentence1'], pair['sentence2'], pair['similarity']])

df_tfidf = pd.DataFrame(all_rows, columns=columns)
df_tfidf

In [None]:
# Odyssey vs. every corpus file, FastText sentence embeddings, threshold 0.92.
# This cell belongs to the Odyssey section, so it compares the `odyssey` text
# and labels the rows 'Odyssey' (not the Iliad).
# NOTE: df_fasttext is rebound here, replacing any earlier result of that name.
columns = ['text1', 'text2', 'sentence1','sentence2','similarity']
all_rows = []
for f in mycorpus.fileids():
    print(f"Processing {f}...")

    # Rebuild the file as one space-separated string from the reader's tokens.
    text2 = ' '.join(mycorpus.words(fileids=f))
    fasttext_df, sentences1, sentences2 = calculate_fasttext_similarity(odyssey, text2)
    pairs = get_sentence_pairs_above_threshold(fasttext_df, sentences1, sentences2, 0.92)

    for pair in pairs:
        all_rows.append(['Odyssey', f, pair['sentence1'], pair['sentence2'], pair['similarity']])

df_fasttext = pd.DataFrame(all_rows, columns=columns)
df_fasttext

In [None]:
# Odyssey vs. every corpus file, MinHash Jaccard estimate, threshold 0.65.
# This cell belongs to the Odyssey section, so it compares the `odyssey` text
# and labels the rows 'Odyssey' (not the Iliad).
# NOTE: df_minhash is rebound here, replacing any earlier result of that name.
columns = ['text1', 'text2', 'sentence1','sentence2','similarity']
all_rows = []
for f in mycorpus.fileids():
    print(f"Processing {f}...")

    # Rebuild the file as one space-separated string from the reader's tokens.
    text2 = ' '.join(mycorpus.words(fileids=f))
    minhash_df, sentences1, sentences2 = calculate_minhash_similarity(odyssey, text2)
    pairs = get_sentence_pairs_above_threshold(minhash_df, sentences1, sentences2, 0.65)

    for pair in pairs:
        all_rows.append(['Odyssey', f, pair['sentence1'], pair['sentence2'], pair['similarity']])

df_minhash = pd.DataFrame(all_rows, columns=columns)
df_minhash