In [1]:
"""
Created on Auguste 2024
@author: Théotime de la Selle
"""

import pandas as pd
import numpy as np
from tqdm import tqdm

import nltk as nl

import cltk
from cltk import NLP



In [2]:
# import os
# os.path.abspath(cltk.__file__)

# Load and verify data

In [3]:
# load (reading csv file) and format data
def load_bible_texts(path):
    df = pd.read_csv(path,delimiter='\t', skiprows=(0),dtype=str,header=1)
    df_rows = df.shape[0]
    display(df.sample(3))
    print("Nomber of verses :",df_rows)
    return df,df_rows

Mark_accented,Mc_verses = load_bible_texts("data/Mark.txt")
Matt_accented,Mt_verses = load_bible_texts("data/Matt.txt")
Luke_accented,Lc_verses = load_bible_texts("data/Luke.txt")
John_accented,Jn_verses = load_bible_texts("data/John.txt")


import unicodedata
def strip_accents(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

def unaccented_text(pd):
    pd_unaccented = pd
    for i in tqdm(range(pd.shape[0])):
        pd_unaccented.text[i] = strip_accents(pd.text[i])
    return pd_unaccented

# Mark = unaccented_text(Mark_accented)
# Matt = unaccented_text(Matt_accented)
# Luke = unaccented_text(Luke_accented)
# John = unaccented_text(John_accented)

Mark = Mark_accented
Matt = Matt_accented
Luke = Luke_accented
John = John_accented

df_concat = [Matt,Mark,Luke,John]
Evangiles = pd.concat(df_concat)
Evangiles.reset_index(drop=True,inplace=True)
Ev_verses = Evangiles.shape[0]
display(Evangiles.head(5))

Unnamed: 0,verses,text
162,Mark 5:14,⸂Καὶ οἱ⸃ βόσκοντες ⸀αὐτοὺς ἔφυγον καὶ ⸀ἀπήγγει...
610,Mark 15:4,ὁ δὲ Πιλᾶτος πάλιν ⸀ἐπηρώτα αὐτὸν λέγων· Οὐκ ἀ...
291,Mark 8:8,"⸂καὶ ἔφαγον⸃ καὶ ἐχορτάσθησαν, καὶ ἦραν περισσ..."


Nomber of verses : 673


Unnamed: 0,verses,text
254,Matt 9:20,Καὶ ἰδοὺ γυνὴ αἱμορροοῦσα δώδεκα ἔτη προσελθοῦ...
222,Matt 8:22,"ὁ δὲ Ἰησοῦς ⸀λέγει αὐτῷ· Ἀκολούθει μοι, καὶ ἄφ..."
518,Matt 15:30,καὶ προσῆλθον αὐτῷ ὄχλοι πολλοὶ ἔχοντες μεθʼ ἑ...


Nomber of verses : 1068


Unnamed: 0,verses,text
269,Luke 6:17,"Καὶ καταβὰς μετʼ αὐτῶν ἔστη ἐπὶ τόπου πεδινοῦ,..."
330,Luke 7:29,(καὶ πᾶς ὁ λαὸς ἀκούσας καὶ οἱ τελῶναι ἐδικαίω...
276,Luke 6:24,"πλὴν οὐαὶ ὑμῖν τοῖς πλουσίοις, ὅτι ἀπέχετε τὴν..."


Nomber of verses : 1149


Unnamed: 0,verses,text
340,John 8:5,ἐν δὲ τῷ νόμῳ Μωσῆς ἡμῖν ἐνετείλατο τὰς τοιαύτ...
371,John 8:36,"ἐὰν οὖν ὁ υἱὸς ὑμᾶς ἐλευθερώσῃ, ὄντως ἐλεύθερο..."
473,John 10:38,"εἰ δὲ ποιῶ, κἂν ἐμοὶ μὴ πιστεύητε τοῖς ἔργοις ..."


Nomber of verses : 878


Unnamed: 0,verses,text
0,Matt 1:1,Βίβλος γενέσεως Ἰησοῦ χριστοῦ υἱοῦ Δαυὶδ υἱοῦ ...
1,Matt 1:2,"Ἀβραὰμ ἐγέννησεν τὸν Ἰσαάκ, Ἰσαὰκ δὲ ἐγέννησεν..."
2,Matt 1:3,Ἰούδας δὲ ἐγέννησεν τὸν Φαρὲς καὶ τὸν Ζάρα ἐκ ...
3,Matt 1:4,"Ἀρὰμ δὲ ἐγέννησεν τὸν Ἀμιναδάβ, Ἀμιναδὰβ δὲ ἐγ..."
4,Matt 1:5,"Σαλμὼν δὲ ἐγέννησεν τὸν ⸂Βόες ἐκ τῆς Ῥαχάβ, Βό..."


# Data standardization

In [4]:
def standardize(df):
    
    # Normalize greek text and remove critical apparatus characters
    from cltk.alphabet.grc import normalize_grc
    from cltk.alphabet.grc import tonos_oxia_converter
    from cltk.alphabet.grc import drop_critical_apparatus_char
    from cltk.alphabet.grc import filter_non_greek
    from cltk.alphabet.grc import expand_iota_subscript
    df.text = [expand_iota_subscript(txt) for txt in df.text]
    df.text = [drop_critical_apparatus_char(txt) for txt in df.text]
    df.text = [filter_non_greek(txt) for txt in df.text]
    # df.text = [tonos_oxia_converter(txt) for txt in df.text]
    df.text = [normalize_grc(txt) for txt in df.text]

    # Manually remove critical apparatus symbols
    # crit_symbols_list = ['⸀', '⸂','⸃','⟧','⟦'] # list of symbols
    # df.text = df.text.replace(crit_symbols_list,'',regex=True)
    
    # Lower case
    df.text = df.text.str.lower()
    return df

Mark = standardize(Mark)
Matt = standardize(Matt)
Luke = standardize(Luke)
John = standardize(John)
Evangiles = standardize(Evangiles)

# Control
Evangiles.text.sample(5)

2783    ἤρξαντο δὲ κατηγορεῖν αὐτοῦ λέγοντες τοῦτον εὕ...
2625    καὶ οὐχ εὕρισκον τὸ τί ποιήσωσιν ὁ λαὸς γὰρ ἅπ...
1933    καὶ εἶπεν πρὸς αὐτούς πάντως ἐρεῖτέ μοι τὴν πα...
3658    ἐξῆλθεν οὖν ὁ πιλᾶτος ἔξω πρὸς αὐτοὺς καὶ φησί...
3029    ἀφῆκεν οὖν τὴν ὑδρίαν αὐτῆς ἡ γυνὴ καὶ ἀπῆλθεν...
Name: text, dtype: object

# Cltk pipeline application

In [5]:
cltk_nlp_grc = NLP(language="grc")

‎𐤀 CLTK version '1.3.0'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/

Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekSpacyProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.

⸖ ``GreekSpacyProcess`` using OdyCy model by Center for Humanities Computing Aarhus from https://huggingface.co/chcaa . Please cite: https://aclanthology.org/2023.latechclfl-1.14
⸖ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/

⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.


In [6]:
Mc_cltk_doc = [cltk_nlp_grc.analyze(text=Mark.text[i]) for i in tqdm(range(Mc_verses),desc="Mark")]
Mt_cltk_doc = [cltk_nlp_grc.analyze(text=Matt.text[i]) for i in tqdm(range(Mt_verses))]
Lc_cltk_doc = [cltk_nlp_grc.analyze(text=Luke.text[i]) for i in tqdm(range(Lc_verses))]
Jn_cltk_doc = [cltk_nlp_grc.analyze(text=John.text[i]) for i in tqdm(range(Jn_verses))]
Ev_cltk_doc = [cltk_nlp_grc.analyze(text=Evangiles.text[i]) for i in tqdm(range(Ev_verses))]

Mark: 100%|██████████████████████████████████| 673/673 [00:06<00:00, 101.83it/s]
100%|██████████████████████████████████████| 1068/1068 [00:05<00:00, 179.70it/s]
100%|██████████████████████████████████████| 1149/1149 [00:06<00:00, 167.26it/s]
100%|████████████████████████████████████████| 878/878 [00:05<00:00, 174.67it/s]
100%|██████████████████████████████████████| 3768/3768 [00:22<00:00, 170.26it/s]


# Dataframe pre-processing from Cltk doc

In [7]:
# Definition of the additionnal stop words list (especially for lemmata)
added_stop_words = ['δεῖ','ὧδε','ἐγώ','ἕως','ἀλλ','ἐάν','ἕξ','κατά','καί','αὐτός','μετά','αὐτὸν', 'εὐθύς','σύ', "τότε","πᾶσα","πᾶς","ἵνα","ὅς","τίς","τις","ἀπό","μή",'τῶι','ὑπ',"πῶς","ὅταν",'ἐπί',"δ","εἷς","οὗτος","πρός","πρὸς","πρό","οὖς","ὅτε","γάρ","δέ","πᾶς"]
# + 'εἶμι' + 'εὖ' + 'ποῦ' ?

In [8]:
def text_dataframe_processing(df,doc):

    print("-------- Processing of dataframe ---------")  

    # ----- Remove stop words and punctuation for adding filtered tokens and lemmata to dataframe
    from cltk.stops.words import Stops
    from cltk.text.processes import DefaultPunctuationRemovalProcess
    from cltk.lemmatize.grc import GreekBackoffLemmatizer
    
    Punct_filter = DefaultPunctuationRemovalProcess(language='grc')
    doc = [Punct_filter.run(d) for d in doc]
    
    # ----- Add tokens lemmata, and tokens_stops_filtered in dataframe
    df['tokens'] = [d.tokens for d in tqdm(doc,desc="Tokens")]
    df['lemmata'] = [d.lemmata for d in tqdm(doc,desc="Lemmata")]
    df['tokens_filtered'] = [d.tokens_stops_filtered for d in tqdm(doc,desc="Tokens filtered")]
     
    lemmatizer = GreekBackoffLemmatizer() # we have to lemmatize the filtered tokens
    tokens_lemmata_filtered = [lemmatizer.lemmatize(lem) for lem in df.tokens_filtered]
    lemmata_filtered = []
    for lem in tokens_lemmata_filtered :    
        lemmata_filtered.append([l[1] for l in lem])

    # Removing stop words from lemmata (based on cltk.stops.words process but only through extra_stops additionnal list as remove_stopwords doesn't work for lemmata)
    stops_obj = Stops(iso_code="grc")
    df['lemmata_filtered'] = [stops_obj.remove_stopwords(tokens=lem, extra_stops=added_stop_words) for lem in tqdm(lemmata_filtered,desc="Lemmata filtered")]
    
    #----- Add n-grams
    df['bigrams'] = [list(nl.bigrams(lem)) for lem in tqdm(df.lemmata_filtered,desc="Bigrams")]
    df['trigrams'] = [list(nl.trigrams(lem)) for lem in tqdm(df.lemmata_filtered,desc="Trigrams")]

    # ----- Add tf-idf score for each lemmata
    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf = TfidfVectorizer(
        analyzer='word',
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        token_pattern=None)  

    # Learn vocabulary and idf, return document-term matrix. 
    doc_term_matrix = tfidf.fit_transform(df.lemmata_filtered)
    tfidf_values = [doc_term_matrix[i,j] for i, j in zip(*doc_term_matrix.nonzero())]
    id_tfidf = [i for i, j in zip(*doc_term_matrix.nonzero())]
    tfidf_verses = []
    for i in tqdm(range(df.shape[0]),desc="tfidf"):
        tfidf_verses.append([tfidf_values[index] for (index, item) in enumerate(id_tfidf) if item == i])
    df['lemmata_tfidf'] = [tf for tf in tfidf_verses]

    # ---- Add part-of-speech feature in dataframe
    pos = []
    for i in tqdm(range(df.shape[0]),desc="Part-of-Speech"):
        pos.append([str(word.pos) for word in doc[i].words])   
    df['pos'] = pos

    # ---- Add morphosyntactic features in dataframe
    df['morpho'] = [d.morphosyntactic_features for d in tqdm(doc,desc="Morphosyntactic features")]    

    # ----- Add vocabulary feature in dataframe
    from nltk.lm import Vocabulary
    vocab = []
    for i in tqdm(range(df.shape[0]),desc="Vocabulary"):
        vocab.append(sorted(Vocabulary(df.lemmata_filtered[i], unk_cutoff=1).counts))
    
    df['vocabulary'] = vocab
    
    return df

Mark = text_dataframe_processing(Mark,Mc_cltk_doc)
Matt = text_dataframe_processing(Matt,Mt_cltk_doc)
Luke = text_dataframe_processing(Luke,Lc_cltk_doc)
John = text_dataframe_processing(John,Jn_cltk_doc)
Evangiles = text_dataframe_processing(Evangiles,Ev_cltk_doc)

display(Evangiles.sample(5)) # To controle pre-processing

-------- Processing of dataframe ---------


Tokens: 100%|█████████████████████████████| 673/673 [00:00<00:00, 246314.71it/s]
Lemmata: 100%|████████████████████████████| 673/673 [00:00<00:00, 151160.25it/s]
Tokens filtered: 100%|████████████████████| 673/673 [00:00<00:00, 105761.21it/s]
Lemmata filtered: 100%|████████████████████| 673/673 [00:00<00:00, 81890.53it/s]
Bigrams: 100%|████████████████████████████| 673/673 [00:00<00:00, 402390.11it/s]
Trigrams: 100%|███████████████████████████| 673/673 [00:00<00:00, 176478.06it/s]
tfidf: 100%|█████████████████████████████████| 673/673 [00:02<00:00, 233.58it/s]
Part-of-Speech: 100%|█████████████████████| 673/673 [00:00<00:00, 215873.86it/s]
Morphosyntactic features: 100%|███████████| 673/673 [00:00<00:00, 258684.62it/s]
Vocabulary: 100%|██████████████████████████| 673/673 [00:00<00:00, 93320.77it/s]


-------- Processing of dataframe ---------


Tokens: 100%|███████████████████████████| 1068/1068 [00:00<00:00, 250126.57it/s]
Lemmata: 100%|██████████████████████████| 1068/1068 [00:00<00:00, 210701.63it/s]
Tokens filtered: 100%|███████████████████| 1068/1068 [00:00<00:00, 97344.82it/s]
Lemmata filtered: 100%|██████████████████| 1068/1068 [00:00<00:00, 81175.66it/s]
Bigrams: 100%|██████████████████████████| 1068/1068 [00:00<00:00, 440593.75it/s]
Trigrams: 100%|█████████████████████████| 1068/1068 [00:00<00:00, 179403.13it/s]
tfidf: 100%|███████████████████████████████| 1068/1068 [00:07<00:00, 135.89it/s]
Part-of-Speech: 100%|███████████████████| 1068/1068 [00:00<00:00, 179101.86it/s]
Morphosyntactic features: 100%|█████████| 1068/1068 [00:00<00:00, 260149.64it/s]
Vocabulary: 100%|████████████████████████| 1068/1068 [00:00<00:00, 50248.09it/s]


-------- Processing of dataframe ---------


Tokens: 100%|███████████████████████████| 1149/1149 [00:00<00:00, 241566.68it/s]
Lemmata: 100%|██████████████████████████| 1149/1149 [00:00<00:00, 187410.28it/s]
Tokens filtered: 100%|██████████████████| 1149/1149 [00:00<00:00, 106693.87it/s]
Lemmata filtered: 100%|██████████████████| 1149/1149 [00:00<00:00, 82653.12it/s]
Bigrams: 100%|██████████████████████████| 1149/1149 [00:00<00:00, 439112.10it/s]
Trigrams: 100%|█████████████████████████| 1149/1149 [00:00<00:00, 404571.47it/s]
tfidf: 100%|███████████████████████████████| 1149/1149 [00:09<00:00, 127.61it/s]
Part-of-Speech: 100%|███████████████████| 1149/1149 [00:00<00:00, 160411.92it/s]
Morphosyntactic features: 100%|█████████| 1149/1149 [00:00<00:00, 269910.69it/s]
Vocabulary: 100%|████████████████████████| 1149/1149 [00:00<00:00, 35173.45it/s]


-------- Processing of dataframe ---------


Tokens: 100%|█████████████████████████████| 878/878 [00:00<00:00, 121417.70it/s]
Lemmata: 100%|█████████████████████████████| 878/878 [00:00<00:00, 88424.11it/s]
Tokens filtered: 100%|█████████████████████| 878/878 [00:00<00:00, 60288.44it/s]
Lemmata filtered: 100%|████████████████████| 878/878 [00:00<00:00, 76988.67it/s]
Bigrams: 100%|████████████████████████████| 878/878 [00:00<00:00, 409086.75it/s]
Trigrams: 100%|███████████████████████████| 878/878 [00:00<00:00, 168694.41it/s]
tfidf: 100%|█████████████████████████████████| 878/878 [00:04<00:00, 181.19it/s]
Part-of-Speech: 100%|█████████████████████| 878/878 [00:00<00:00, 230999.81it/s]
Morphosyntactic features: 100%|███████████| 878/878 [00:00<00:00, 235912.81it/s]
Vocabulary: 100%|██████████████████████████| 878/878 [00:00<00:00, 62458.22it/s]


-------- Processing of dataframe ---------


Tokens: 100%|███████████████████████████| 3768/3768 [00:00<00:00, 167009.80it/s]
Lemmata: 100%|██████████████████████████| 3768/3768 [00:00<00:00, 206247.63it/s]
Tokens filtered: 100%|██████████████████| 3768/3768 [00:00<00:00, 160839.99it/s]
Lemmata filtered: 100%|██████████████████| 3768/3768 [00:00<00:00, 83978.35it/s]
Bigrams: 100%|██████████████████████████| 3768/3768 [00:00<00:00, 455306.31it/s]
Trigrams: 100%|█████████████████████████| 3768/3768 [00:00<00:00, 379843.23it/s]
tfidf: 100%|████████████████████████████████| 3768/3768 [01:30<00:00, 41.64it/s]
Part-of-Speech: 100%|███████████████████| 3768/3768 [00:00<00:00, 203425.63it/s]
Morphosyntactic features: 100%|█████████| 3768/3768 [00:00<00:00, 243684.18it/s]
Vocabulary: 100%|████████████████████████| 3768/3768 [00:00<00:00, 97128.92it/s]


Unnamed: 0,verses,text,tokens,lemmata,tokens_filtered,lemmata_filtered,bigrams,trigrams,lemmata_tfidf,pos,morpho,vocabulary
3498,John 13:24,νεύει οὖν τούτωι σίμων πέτρος πυθέσθαι τίς ἂν ...,"[νεύει, οὖν, τούτωι, σίμων, πέτρος, πυθέσθαι, ...","[νεύω, οὖν, τούτωι, σίμων, πέτρος, πυνθάνομαι,...","[νεύει, τούτωι, σίμων, πέτρος, πυθέσθαι, τίς, ...","[νεύω, σιμόω, πέτρος, πυνθάνομαι, εἰμί, λέγω]","[(νεύω, σιμόω), (σιμόω, πέτρος), (πέτρος, πυνθ...","[(νεύω, σιμόω, πέτρος), (σιμόω, πέτρος, πυνθάν...","[0.17431684772421588, 0.1636486204038483, 0.53...","[verb, adverb, adjective, adjective, noun, ver...","[[(admirative, conditional, desiderative, impe...","[εἰμί, λέγω, νεύω, πέτρος, πυνθάνομαι, σιμόω]"
2665,Luke 20:40,οὐκέτι γὰρ ἐτόλμων ἐπερωτᾶν αὐτὸν οὐδέν,"[οὐκέτι, γὰρ, ἐτόλμων, ἐπερωτᾶν, αὐτὸν, οὐδέν]","[οὐκέτι, γάρ, τολμάω, ἐπερωτάω, αὐτός, οὐδείς]","[οὐκέτι, ἐτόλμων, ἐπερωτᾶν, αὐτὸν, οὐδέν]","[οὐκέτι, τολμάω, ἐπερωτάω, αὐτός, οὐδείς]","[(οὐκέτι, τολμάω), (τολμάω, ἐπερωτάω), (ἐπερωτ...","[(οὐκέτι, τολμάω, ἐπερωτάω), (τολμάω, ἐπερωτάω...","[0.13666993399265762, 0.4031742542355808, 0.43...","[adverb, adverb, verb, verb, pronoun, determiner]","[[(pos, neg)], [], [(habitual, imperfective, i...","[αὐτός, οὐδείς, οὐκέτι, τολμάω, ἐπερωτάω]"
899,Matt 25:39,πότε δέ σε εἴδομεν ἀσθενοῦντα ἢ ἐν φυλακῆι καὶ...,"[πότε, δέ, σε, εἴδομεν, ἀσθενοῦντα, ἢ, ἐν, φυλ...","[πότε, δέ, σύ, ὁράω, ἀσθενέω, ἤ, ἐν, φυλακῆι, ...","[πότε, δέ, σε, εἴδομεν, ἀσθενοῦντα, φυλακῆι, ἤ...","[ποτέ, εἶδον, ἀσθενέω, φυλακή, ἔρχομαι]","[(ποτέ, εἶδον), (εἶδον, ἀσθενέω), (ἀσθενέω, φυ...","[(ποτέ, εἶδον, ἀσθενέω), (εἶδον, ἀσθενέω, φυλα...","[0.3165116656176926, 0.2630305175056568, 0.502...","[adverb, adverb, pronoun, verb, verb, coordina...","[[(article, contrastive, demonstrative, emphat...","[εἶδον, ποτέ, φυλακή, ἀσθενέω, ἔρχομαι]"
83,Matt 4:19,καὶ λέγει αὐτοῖς δεῦτε ὀπίσω μου καὶ ποιήσω ὑμ...,"[καὶ, λέγει, αὐτοῖς, δεῦτε, ὀπίσω, μου, καὶ, π...","[καί, λέγω, αὐτός, δεῦτε, ὀπίσω, ἐγώ, καί, ποι...","[λέγει, αὐτοῖς, δεῦτε, ὀπίσω, μου, ποιήσω, ὑμᾶ...","[λέγω, αὐτός, δεῦτε, ὀπίσω, ποιέω, ἁλίζω1, ἄνθ...","[(λέγω, αὐτός), (αὐτός, δεῦτε), (δεῦτε, ὀπίσω)...","[(λέγω, αὐτός, δεῦτε), (αὐτός, δεῦτε, ὀπίσω), ...","[0.12944004319112637, 0.1887926628999442, 0.27...","[coordinating_conjunction, verb, pronoun, inte...","[[], [(admirative, conditional, desiderative, ...","[αὐτός, δεῦτε, λέγω, ποιέω, ἁλίζω1, ἄνθρωπος, ..."
2202,Luke 9:54,ἰδόντες δὲ οἱ μαθηταὶ ἰάκωβος καὶ ἰωάννης εἶπα...,"[ἰδόντες, δὲ, οἱ, μαθηταὶ, ἰάκωβος, καὶ, ἰωάνν...","[ὁράω, δέ, ὁ, μαθητής, ἰάκωβος, καί, ἰωάν(ν)ης...","[ἰδόντες, μαθηταὶ, ἰάκωβος, ἰωάννης, εἶπαν, κύ...","[εἶδον, μαθηταὶ, ἰάκωβος, ιὤαννης, εἶπον, κύρι...","[(εἶδον, μαθηταὶ), (μαθηταὶ, ἰάκωβος), (ἰάκωβο...","[(εἶδον, μαθηταὶ, ἰάκωβος), (μαθηταὶ, ἰάκωβος,...","[0.09295145692455532, 0.2511570339637543, 0.21...","[verb, adverb, determiner, noun, noun, coordin...","[[(habitual, imperfective, iterative, perfecti...","[αὐτός, εἶδον, εἶπον, ιὤαννης, καταβαίνω, κύρι..."


# Edit distance test on 2 verses

In [16]:
# Edit distance test on verses
id_verse_1 = "Mark 8:31"
# id_verse_1 = "Matt 16:21"
id_verse_2 = "Luke 9:22"

def extract_verse(id_verse,df):   
    extracted_verse = df[df.verses == id_verse].reset_index(drop=True)
    display(extracted_verse)
    return extracted_verse

# Find the verses in dataframes
v_1 = extract_verse(id_verse_1,Evangiles)
v_2 = extract_verse(id_verse_2,Evangiles)

def compute_distance(v_1,v_2,method,info=False,transpos=False):
    str_v1 = v_1[method][0]
    len_v1 = len(str_v1)
    print(len_v1)
    str_v2 = v_2[method][0]
    len_v2 = len(str_v2)
    print(len_v2)
   
    # Compute edit distance (characters)
    edit_dist = (nl.edit_distance(str_v1, str_v2,transpositions=transpos)-abs(len_v1-len_v2))/min(len_v1,len_v2)  # Normalized distance 1
    # edit_dist = (nl.edit_distance(str_v1, str_v2,transpositions=transpos))/np.max([len_v1,len_v2])  # Normalized distance  2  
    # edit_dist = (nl.edit_distance(str_v1, str_v2)-abs(len_v1-len_v2))/min(len_v1,len_v2)  # Raw distance (unity : words/characters)

    if info == True:
        print("\n-----",method," -----\n")
        print(str_v1)
        print(str_v2)
        print(f"Edit distance between 2 verses :",edit_dist)
    return edit_dist

compute_distance(v_1,v_2,'text',True)
compute_distance(v_1,v_2,'tokens',True,True)
compute_distance(v_1,v_2,'lemmata',True,True)
compute_distance(v_1,v_2,'tokens_filtered',True,True)
compute_distance(v_1,v_2,'lemmata_filtered',True,True)
compute_distance(v_1,v_2,'pos',True,True)
compute_distance(v_1,v_2,'morpho',True,True)
compute_distance(v_1,v_2,'vocabulary',True,True)

Unnamed: 0,verses,text,tokens,lemmata,tokens_filtered,lemmata_filtered,bigrams,trigrams,lemmata_tfidf,pos,morpho,vocabulary
0,Mark 8:31,καὶ ἤρξατο διδάσκειν αὐτοὺς ὅτι δεῖ τὸν υἱὸν τ...,"[καὶ, ἤρξατο, διδάσκειν, αὐτοὺς, ὅτι, δεῖ, τὸν...","[καί, ἄρχω, διδάσκω, αὐτός, ὅτι, δεῖ, ὁ, υἱός,...","[ἤρξατο, διδάσκειν, αὐτοὺς, δεῖ, υἱὸν, ἀνθρώπο...","[ἄρχω, διδάσκω, αὐτός, υἱός, ἄνθρωπος, πολύς, ...","[(ἄρχω, διδάσκω), (διδάσκω, αὐτός), (αὐτός, υἱ...","[(ἄρχω, διδάσκω, αὐτός), (διδάσκω, αὐτός, υἱός...","[0.19473592611178892, 0.08473834811934751, 0.2...","[coordinating_conjunction, verb, verb, pronoun...","[[], [(habitual, imperfective, iterative, perf...","[αὐτός, γραμματεύς, διδάσκω, πάσχω, πολύς, πρέ..."


Unnamed: 0,verses,text,tokens,lemmata,tokens_filtered,lemmata_filtered,bigrams,trigrams,lemmata_tfidf,pos,morpho,vocabulary
0,Luke 9:22,εἰπὼν ὅτι δεῖ τὸν υἱὸν τοῦ ἀνθρώπου πολλὰ παθε...,"[εἰπὼν, ὅτι, δεῖ, τὸν, υἱὸν, τοῦ, ἀνθρώπου, πο...","[λέγω, ὅτι, δεῖ, ὁ, υἱός, ὁ, ἄνθρωπος, πολύς, ...","[εἰπὼν, δεῖ, υἱὸν, ἀνθρώπου, πολλὰ, παθεῖν, ἀπ...","[εἶπον, υἱός, ἄνθρωπος, πολύς, πάσχω, ἀποδοκιμ...","[(εἶπον, υἱός), (υἱός, ἄνθρωπος), (ἄνθρωπος, π...","[(εἶπον, υἱός, ἄνθρωπος), (υἱός, ἄνθρωπος, πολ...","[0.20768776823374044, 0.21975362111698, 0.2688...","[verb, subordinating_conjunction, verb, determ...","[[(habitual, imperfective, iterative, perfecti...","[γραμματεύς, εἶπον, πάσχω, πολύς, πρέσβυς, τρί..."


196
166

----- text  -----

καὶ ἤρξατο διδάσκειν αὐτοὺς ὅτι δεῖ τὸν υἱὸν τοῦ ἀνθρώπου πολλὰ παθεῖν καὶ ἀποδοκιμασθῆναι ὑπὸ τῶν πρεσβυτέρων καὶ τῶν ἀρχιερέων καὶ τῶν γραμματέων καὶ ἀποκτανθῆναι καὶ μετὰ τρεῖς ἡμέρας ἀναστῆναι
εἰπὼν ὅτι δεῖ τὸν υἱὸν τοῦ ἀνθρώπου πολλὰ παθεῖν καὶ ἀποδοκιμασθῆναι ἀπὸ τῶν πρεσβυτέρων καὶ ἀρχιερέων καὶ γραμματέων καὶ ἀποκτανθῆναι καὶ τῆι τρίτηι ἡμέραι ἐγερθῆναι
Edit distance between 2 verses : 0.1144578313253012
30
25

----- tokens  -----

['καὶ', 'ἤρξατο', 'διδάσκειν', 'αὐτοὺς', 'ὅτι', 'δεῖ', 'τὸν', 'υἱὸν', 'τοῦ', 'ἀνθρώπου', 'πολλὰ', 'παθεῖν', 'καὶ', 'ἀποδοκιμασθῆναι', 'ὑπὸ', 'τῶν', 'πρεσβυτέρων', 'καὶ', 'τῶν', 'ἀρχιερέων', 'καὶ', 'τῶν', 'γραμματέων', 'καὶ', 'ἀποκτανθῆναι', 'καὶ', 'μετὰ', 'τρεῖς', 'ἡμέρας', 'ἀναστῆναι']
['εἰπὼν', 'ὅτι', 'δεῖ', 'τὸν', 'υἱὸν', 'τοῦ', 'ἀνθρώπου', 'πολλὰ', 'παθεῖν', 'καὶ', 'ἀποδοκιμασθῆναι', 'ἀπὸ', 'τῶν', 'πρεσβυτέρων', 'καὶ', 'ἀρχιερέων', 'καὶ', 'γραμματέων', 'καὶ', 'ἀποκτανθῆναι', 'καὶ', 'τῆι', 'τρίτηι', 'ἡμέραι', 'ἐγερθῆναι

0.23076923076923078

# Save dataframes

In [22]:
import pickle
with open("data/Evangiles.pkl", "wb") as file:
    pickle.dump(Evangiles, file, protocol=pickle.HIGHEST_PROTOCOL)
with open("data/Mark.pkl", "wb") as file:
    pickle.dump(Mark, file, protocol=pickle.HIGHEST_PROTOCOL)
with open("data/Matt.pkl", "wb") as file:
    pickle.dump(Matt, file, protocol=pickle.HIGHEST_PROTOCOL)
with open("data/Luke.pkl", "wb") as file:
    pickle.dump(Luke, file, protocol=pickle.HIGHEST_PROTOCOL)
with open("data/John.pkl", "wb") as file:
    pickle.dump(John, file, protocol=pickle.HIGHEST_PROTOCOL)

# Distance matrix computation

In [10]:
test = Mark[Mark.index == 0].reset_index(drop=True)
test['tokens'][0]

['ἀρχὴ', 'τοῦ', 'εὐαγγελίου', 'ἰησοῦ', 'χριστοῦ', '.']

In [13]:
def distance_matrix(df,method):
    len_df = df.shape[0]
    dist_mat = np.ones((len_df,len_df))
    for i in tqdm(range(len_df)):
        for j in range(i,len_df):
            v1 = df[df.index == i].reset_index(drop=True)
            v2 = df[df.index == j].reset_index(drop=True)
            dist_mat[i,j] = compute_distance(v1,v2,method)
    return dist_mat

# dist_mat = distance_matrix(Mark[Mark.index < 700],"lemmata_filtered")
dist_mat_Mc_lemmata_filtered = distance_matrix(Mark,"lemmata_filtered")
dist_mat_Mc_tokens_filtered = distance_matrix(Mark,"tokens_filtered")
dist_mat_Mc_text = distance_matrix(Mark,"text")

100%|█████████████████████████████████████████| 673/673 [01:52<00:00,  5.99it/s]


In [17]:
%matplotlib widget
ax = sns.heatmap(dist_mat[dist_mat<0.4], linewidth=0)
plt.show()

IndexError: Inconsistent shape between the condition and the input (got (704, 1) and (704,))

In [123]:
print(Mark.lemmata_filtered[39])
print(Mark.lemmata_filtered[40])

['ἔρχομαι', 'αὐτός', 'λεπρὸς', 'παρακαλέω', 'αὐτός', 'γονυπετέω', 'λέγω', 'αὐτός', 'ἐθέλω', 'δύναμαι', 'ἐγώ', 'καθαρίζω']
['ὀργίζω', 'ἐκτείνω', 'χείρ', 'αὐτός', 'ἅπτω', 'λέγω', 'αὐτός', 'ἐθέλω', 'καθαρίζω']


In [None]:
import pickle
with open("Distance_matrices/Distance_matrix"+, "wb") as pickle_file:
    pickle.dump(
        reduced_model,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )