In [1]:
"""
Created on Auguste 2024
@author: Théotime de la Selle
"""
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import glob
import os

import nltk as nl
import pylcs

import cltk
from cltk import NLP



In [2]:
# dirpath = "data/SBLGNT/"
dirpath = "data/LXX-Rahlf-1935/"

# Load and verify data

In [3]:
# load (reading csv file) and format data
def load_bible_texts(path):
    df = pd.read_csv(path,delimiter='\t', skiprows=(0),dtype=str,header=1)
    df_rows = df.shape[0]
    return df

# def unaccented_text(pd):
#     pd_unaccented = pd
#     for i in tqdm(range(pd.shape[0])):
#         pd_unaccented.text[i] = strip_accents(pd.text[i])
#     return pd_unaccented

# For loading multiple files 
# filepaths = list(filter(os.path.isfile, glob.glob(dirpath+"*.txt")))
# filepaths.sort()

# For loading one file
filepaths = [dirpath+"LXX.txt"]

In [4]:
df = [load_bible_texts(filepaths[0])]
for filepath in tqdm(filepaths[1:]):
    df.append(load_bible_texts(filepath))

NT = pd.concat(df)
NT.reset_index(drop=True,inplace=True)
NT_verses = NT.shape[0]

display(NT.sample(5))
print(NT_verses)

0it [00:00, ?it/s]


Unnamed: 0,verse,text
6009,「JoshB 9:26」,καὶ ἐποίησαν αὐτοῖς οὕτως καὶ ἐξείλατο αὐτοὺς ...
19656,「Od 5:16」,κύριε ἐν θλίψει ἐμνήσθην σου ἐν θλίψει μικρᾷ ἡ...
8951,「2Sam/K 13:7」,καὶ ἀπέστειλεν Δαυιδ πρὸς Θημαρ εἰς τὸν οἶκον ...
1850,「Exod 12:36」,καὶ κύριος ἔδωκεν τὴν χάριν τῷ λαῷ αὐτοῦ ἐναντ...
15628,「1Mac 13:38」,καὶ ὅσα ἐστήσαμεν πρὸς ὑμᾶς ἕστηκεν καὶ τὰ ὀχυ...


30637


# Data standardization

In [5]:
def standardize(df):
    
    # Normalize greek text and remove critical apparatus characters
    from cltk.alphabet.grc import normalize_grc
    from cltk.alphabet.grc import tonos_oxia_converter
    from cltk.alphabet.grc import drop_critical_apparatus_char
    from cltk.alphabet.grc import filter_non_greek
    from cltk.alphabet.grc import expand_iota_subscript
    # df.text = [expand_iota_subscript(txt) for txt in df.text]
    df.text = [drop_critical_apparatus_char(txt) for txt in df.text]
    df.text = [filter_non_greek(txt) for txt in df.text]
    # df.text = [tonos_oxia_converter(txt) for txt in df.text]
    df.text = [normalize_grc(txt) for txt in df.text]

    # Manually remove critical apparatus symbols
    # crit_symbols_list = ['⸀', '⸂','⸃','⟧','⟦'] # list of symbols
    # df.text = df.text.replace(crit_symbols_list,'',regex=True)
    
    # Lower case
    df.text = df.text.str.lower()
    return df

NT = standardize(NT)

# Control
NT.text.sample(5)

2307     καὶ καθυφανεῖς ἐν αὐτῷ ὕφασμα κατάλιθον τετράσ...
1373     εἶπεν δὲ φαραω πρὸς ιωσηφ εἰπὸν τοῖς ἀδελφοῖς ...
18541    οὐ φοβηθήσῃ ἀπὸ φόβου νυκτερινοῦ ἀπὸ βέλους πε...
30371    τότε δαρεῖος ὁ βασιλεὺς ἔγραψεν πᾶσι τοῖς λαοῖ...
10839    καὶ ἐβασίλευσεν φαραω νεχαω ἐπ ̓ αὐτοὺς τὸν ελ...
Name: text, dtype: object

# Cltk pipeline application

In [6]:
cltk_nlp_grc = NLP(language="grc")

‎𐤀 CLTK version '1.3.0'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/

Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekSpacyProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.

⸖ ``GreekSpacyProcess`` using OdyCy model by Center for Humanities Computing Aarhus from https://huggingface.co/chcaa . Please cite: https://aclanthology.org/2023.latechclfl-1.14
⸖ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/

⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.


In [7]:
NT_doc = [cltk_nlp_grc.analyze(text=NT.text[i]) for i in tqdm(range(NT_verses),desc="NT pipeline application")]

NT pipeline application: 100%|██████████| 30637/30637 [05:10<00:00, 98.80it/s] 


# Dataframe pre-processing from Cltk doc

In [8]:
# Definition of the additionnal stop words list (especially for lemmata)
added_stop_words = ['δεῖ','ὧδε','ἐγώ','ἕως','ἀλλ','ἐάν','ἕξ','κατά','καί','αὐτός','μετά','αὐτὸν', 'εὐθύς','σύ', "τότε","πᾶσα","πᾶς","ἵνα","ὅς","τίς","τις","ἀπό","μή",'τῶι','ὑπ',"πῶς","ὅταν",'ἐπί',"δ","εἷς","οὗτος","πρός","πρὸς","πρό","οὖς","ὅτε","γάρ","δέ","πᾶς"]
# + 'εἶμι' + 'εὖ' + 'ποῦ' ?

In [9]:
def text_dataframe_processing(df,doc):

    print("-------- Processing of dataframe ---------")  

    # ----- Remove stop words and punctuation for adding filtered tokens and lemmata to dataframe
    from cltk.stops.words import Stops
    from cltk.text.processes import DefaultPunctuationRemovalProcess
    from cltk.lemmatize.grc import GreekBackoffLemmatizer
    
    Punct_filter = DefaultPunctuationRemovalProcess(language='grc')
    doc = [Punct_filter.run(d) for d in doc]
    
    # ----- Add tokens lemmata, and tokens_stops_filtered in dataframe
    df['tokens'] = [d.tokens for d in tqdm(doc,desc="Tokens")]
    df['lemmata'] = [d.lemmata for d in tqdm(doc,desc="Lemmata")]
    df['tokens_filtered'] = [d.tokens_stops_filtered for d in tqdm(doc,desc="Tokens filtered")]
     
    lemmatizer = GreekBackoffLemmatizer() # we have to lemmatize the filtered tokens
    tokens_lemmata_filtered = [lemmatizer.lemmatize(lem) for lem in df.tokens_filtered]
    lemmata_filtered = []
    for lem in tokens_lemmata_filtered :    
        lemmata_filtered.append([l[1] for l in lem])

    # Removing stop words from lemmata (based on cltk.stops.words process but only through extra_stops additionnal list as remove_stopwords doesn't work for lemmata)
    stops_obj = Stops(iso_code="grc")
    df['lemmata_filtered'] = [stops_obj.remove_stopwords(tokens=lem, extra_stops=added_stop_words) for lem in tqdm(lemmata_filtered,desc="Lemmata filtered")]
    
    #----- Add n-grams
    df['bigrams'] = [list(nl.bigrams(lem)) for lem in tqdm(df.lemmata_filtered,desc="Bigrams")]
    df['trigrams'] = [list(nl.trigrams(lem)) for lem in tqdm(df.lemmata_filtered,desc="Trigrams")]

    # ----- Add tf-idf score for each lemmata
    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf = TfidfVectorizer(
        analyzer='word',
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        token_pattern=None)  

    # Learn vocabulary and idf, return document-term matrix. 
    # doc_term_matrix = tfidf.fit_transform(df.lemmata_filtered)
    # tfidf_values = [doc_term_matrix[i,j] for i, j in zip(*doc_term_matrix.nonzero())]
    # id_tfidf = [i for i, j in zip(*doc_term_matrix.nonzero())]
    # tfidf_verses = []
    # for i in tqdm(range(df.shape[0]),desc="tfidf"):
    #     tfidf_verses.append([tfidf_values[index] for (index, item) in enumerate(id_tfidf) if item == i])
    # df['lemmata_tfidf'] = [tf for tf in tfidf_verses]

    # ---- Add part-of-speech feature in dataframe
    pos = []
    for i in tqdm(range(df.shape[0]),desc="Part-of-Speech"):
        pos.append([str(word.pos) for word in doc[i].words])   
    df['pos'] = pos

    # ---- Add morphosyntactic features in dataframe
    df['morpho'] = [d.morphosyntactic_features for d in tqdm(doc,desc="Morphosyntactic features")]    

    # ----- Add vocabulary feature in dataframe
    from nltk.lm import Vocabulary
    vocab = []
    for i in tqdm(range(df.shape[0]),desc="Vocabulary"):
        vocab.append(sorted(Vocabulary(df.lemmata_filtered[i], unk_cutoff=1).counts))
    
    df['vocabulary'] = vocab
    
    return df


Evangiles = text_dataframe_processing(NT,NT_doc)

display(NT.sample(5)) # To controle pre-processing

-------- Processing of dataframe ---------


Tokens: 100%|██████████| 30637/30637 [00:00<00:00, 150286.23it/s]
Lemmata: 100%|██████████| 30637/30637 [00:00<00:00, 183023.63it/s]
Tokens filtered: 100%|██████████| 30637/30637 [00:00<00:00, 114724.13it/s]
Lemmata filtered: 100%|██████████| 30637/30637 [00:00<00:00, 43888.46it/s]
Bigrams: 100%|██████████| 30637/30637 [00:00<00:00, 244886.28it/s]
Trigrams: 100%|██████████| 30637/30637 [00:00<00:00, 235790.54it/s]
Part-of-Speech: 100%|██████████| 30637/30637 [00:00<00:00, 135775.76it/s]
Morphosyntactic features: 100%|██████████| 30637/30637 [00:00<00:00, 203771.40it/s]
Vocabulary: 100%|██████████| 30637/30637 [00:00<00:00, 50146.32it/s]


Unnamed: 0,verse,text,tokens,lemmata,tokens_filtered,lemmata_filtered,bigrams,trigrams,pos,morpho,vocabulary
21148,「Job 2:6」,εἶπεν δὲ ὁ κύριος τῷ διαβόλῳ ἰδοὺ παραδίδωμί σ...,"[εἶπεν, δὲ, ὁ, κύριος, τῷ, διαβόλῳ, ἰδοὺ, παρα...","[λέγω, δέ, ὁ, κύριος, ὁ, διάβολος, ἰδοὺ, παραδ...","[εἶπεν, κύριος, διαβόλῳ, ἰδοὺ, παραδίδωμί, σοι...","[εἶπον, κύριος, διάβολος, ἰδού, παραδίδωμί, αὐ...","[(εἶπον, κύριος), (κύριος, διάβολος), (διάβολο...","[(εἶπον, κύριος, διάβολος), (κύριος, διάβολος,...","[verb, adverb, determiner, noun, determiner, n...","[[(habitual, imperfective, iterative, perfecti...","[αὐτός, διάβολος, διαφυλάσσω, εἶπον, κύριος, μ..."
11198,「1Chr 7:35」,καὶ βανηελαμ ἀδελφοὶ αὐτοῦ σωφα καὶ ιμανα καὶ ...,"[καὶ, βανηελαμ, ἀδελφοὶ, αὐτοῦ, σωφα, καὶ, ιμα...","[καί, βανηελαμ, ἀδελφός, αὐτός, σωφος, καί, ιμ...","[βανηελαμ, ἀδελφοὶ, αὐτοῦ, σωφα, ιμανα, σελλης...","[βανηελαμ, ἀδελφός, αὐτός, σωφα, ιμανα, σελλης...","[(βανηελαμ, ἀδελφός), (ἀδελφός, αὐτός), (αὐτός...","[(βανηελαμ, ἀδελφός, αὐτός), (ἀδελφός, αὐτός, ...","[coordinating_conjunction, noun, noun, pronoun...","[[], [(nominative, accusative, ergative, absol...","[αμαλ, αὐτός, βανηελαμ, ιμανα, σελλης, σωφα, ἀ..."
3352,「Lev 22:33」,ὁ ἐξαγαγὼν ὑμᾶς ἐκ γῆς αἰγύπτου ὥστε εἶναι ὑμῶ...,"[ὁ, ἐξαγαγὼν, ὑμᾶς, ἐκ, γῆς, αἰγύπτου, ὥστε, ε...","[ὁ, ἐξάγω, ὑμεῖς, ἐκ, γῆ, αἴγυπτος, ὥστε, εἰμί...","[ἐξαγαγὼν, ὑμᾶς, γῆς, αἰγύπτου, εἶναι, ὑμῶν, θ...","[ἐξάγω, γῆ, αἴγυπτος, εἰμί, θεός, κύριος]","[(ἐξάγω, γῆ), (γῆ, αἴγυπτος), (αἴγυπτος, εἰμί)...","[(ἐξάγω, γῆ, αἴγυπτος), (γῆ, αἴγυπτος, εἰμί), ...","[determiner, verb, pronoun, adposition, noun, ...","[[(nominative, accusative, ergative, absolutiv...","[αἴγυπτος, γῆ, εἰμί, θεός, κύριος, ἐξάγω]"
9325,「2Sam/K 24:6」,καὶ ἦλθον εἰς τὴν γαλααδ καὶ εἰς γῆν θαβασων ἥ...,"[καὶ, ἦλθον, εἰς, τὴν, γαλααδ, καὶ, εἰς, γῆν, ...","[καί, ἔρχομαι, εἰς, ὁ, γαλααδ, καί, εἰς, γῆ, θ...","[ἦλθον, γαλααδ, γῆν, θαβασων, ἐστιν, αδασαι, π...","[ἔρχομαι, γαλααδ, γῆ, θαβασων, εἰμί, αδασαι, π...","[(ἔρχομαι, γαλααδ), (γαλααδ, γῆ), (γῆ, θαβασων...","[(ἔρχομαι, γαλααδ, γῆ), (γαλααδ, γῆ, θαβασων),...","[coordinating_conjunction, verb, adposition, d...","[[], [(habitual, imperfective, iterative, perf...","[αδασαι, γαλααδ, γῆ, δανιδαν, εἰμί, θαβασων, κ..."
21434,「Job 13:25」,ἦ ὡς φύλλον κινούμενον ὑπὸ ἀνέμου εὐλαβηθήσῃ ἢ...,"[ἦ, ὡς, φύλλον, κινούμενον, ὑπὸ, ἀνέμου, εὐλαβ...","[ἦ, ὡς, φύλλος, κινέω, ὑπό, ἄνεμος, εὐλαβηθέω,...","[ἦ, φύλλον, κινούμενον, ἀνέμου, εὐλαβηθήσῃ, χό...","[εἰμί, φύλλον, κινέω, ἄνεμος, εὐλαβέομαι, χόρ...","[(εἰμί, φύλλον), (φύλλον, κινέω), (κινέω, ἄνεμ...","[(εἰμί, φύλλον, κινέω), (φύλλον, κινέω, ἄνεμος...","[adverb, adverb, adjective, verb, adposition, ...","[[], [], [(nominative, accusative, ergative, a...","[εὐλαβέομαι, εἰμί, κινέω, πνεῦμα, φέρω, φύλλο..."


# Metric test on 2 verses

In [10]:
from nltk.metrics.distance import jaro_similarity

In [11]:
# Edit distance test on verses
id_verse_1 = "「JudgA 13:1」"
# id_verse_2 = "Matt 16:21"
id_verse_2 = "「Ps 131:18」"

def extract_verse(id_verse,df):   
    extracted_verse = df[df.verse == id_verse].reset_index(drop=True)
    display(extracted_verse)
    return extracted_verse

# Find the verses in dataframes
v_1 = extract_verse(id_verse_1,NT)
v_2 = extract_verse(id_verse_2,NT)

def compute_distance(v_1,v_2,method,info=False,transpos=False):
    str_v1 = v_1[method][0]
    len_v1 = len(str_v1)
    str_v2 = v_2[method][0]
    len_v2 = len(str_v2)

    transpos = False
    # Compute distance
    # edit_dist = (nl.edit_distance(str_v1, str_v2,substitution_cost=1,transpositions=transpos)-abs(len_v1-len_v2))/min(len_v1,len_v2)  # Normalized distance 1
    # edit_dist = jaro_similarity(str_v1, str_v2)  # Normalized distance 1
    # edit_dist = 1-(pylcs.lcs_sequence_length(str_v1,str_v2)-abs(len_v1-len_v2))/min(len_v1,len_v2)
    edit_dist = (nl.edit_distance(str_v1, str_v2,substitution_cost=1,transpositions=transpos))/np.max([len_v1,len_v2])  # Normalized distance  2  
    # edit_dist = (nl.edit_distance(str_v1, str_v2)-abs(len_v1-len_v2))/min(len_v1,len_v2)  # Raw distance (unity : words/characters)

    if info == True:
        print("\n-----",method," -----\n")
        print(str_v1)
        print(str_v2)
        print(f"Edit distance between 2 verses :",edit_dist)
    return edit_dist

compute_distance(v_1,v_2,'text',True)
%time compute_distance(v_1,v_2,'tokens',True,True)
compute_distance(v_1,v_2,'lemmata',True,True)
compute_distance(v_1,v_2,'tokens_filtered',True,True)
compute_distance(v_1,v_2,'lemmata_filtered',True,True)
compute_distance(v_1,v_2,'pos',True,True)
compute_distance(v_1,v_2,'morpho',True,True)
compute_distance(v_1,v_2,'vocabulary',True,True)

Unnamed: 0,verse,text,tokens,lemmata,tokens_filtered,lemmata_filtered,bigrams,trigrams,pos,morpho,vocabulary
0,「JudgA 13:1」,καὶ προσέθεντο οἱ υἱοὶ ισραηλ ποιῆσαι τὸ πονηρ...,"[καὶ, προσέθεντο, οἱ, υἱοὶ, ισραηλ, ποιῆσαι, τ...","[καί, προστίθημι, ὁ, υἱός, ισραηλ, ποιέω, ὁ, π...","[προσέθεντο, υἱοὶ, ισραηλ, ποιῆσαι, πονηρὸν, ἐ...","[προστίθημι, υἱοὶ, ισραηλ, ποιέω, πονηρός, ἐνα...","[(προστίθημι, υἱοὶ), (υἱοὶ, ισραηλ), (ισραηλ, ...","[(προστίθημι, υἱοὶ, ισραηλ), (υἱοὶ, ισραηλ, πο...","[coordinating_conjunction, verb, determiner, n...","[[], [(habitual, imperfective, iterative, perf...","[αὐτός, ισραηλ, κύριος, παραδίδωμι, ποιέω, πον..."


Unnamed: 0,verse,text,tokens,lemmata,tokens_filtered,lemmata_filtered,bigrams,trigrams,pos,morpho,vocabulary
0,「Ps 131:18」,τοὺς ἐχθροὺς αὐτοῦ ἐνδύσω αἰσχύνην ἐπὶ δὲ αὐτὸ...,"[τοὺς, ἐχθροὺς, αὐτοῦ, ἐνδύσω, αἰσχύνην, ἐπὶ, ...","[ὁ, ἐχθρός, αὐτός, ἐνδύω, αἰσχύνη, ἐπί, δέ, αὐ...","[ἐχθροὺς, αὐτοῦ, ἐνδύσω, αἰσχύνην, αὐτὸν, ἐξαν...","[ἐχθρός, αὐτός, ἐνδύω, αἰσχύνη, αὐτός, ἐξανθέω...","[(ἐχθρός, αὐτός), (αὐτός, ἐνδύω), (ἐνδύω, αἰσχ...","[(ἐχθρός, αὐτός, ἐνδύω), (αὐτός, ἐνδύω, αἰσχύν...","[determiner, adjective, pronoun, verb, noun, a...","[[(nominative, accusative, ergative, absolutiv...","[αἰσχύνη, αὐτός, ἁγίασμά, ἐνδύω, ἐξανθέω, ἐχθρός]"



----- text  -----

καὶ προσέθεντο οἱ υἱοὶ ισραηλ ποιῆσαι τὸ πονηρὸν ἐναντίον κυρίου καὶ παρέδωκεν αὐτοὺς κύριος ἐν χειρὶ ἀλλοφύλων τεσσαράκοντα ἔτη
τοὺς ἐχθροὺς αὐτοῦ ἐνδύσω αἰσχύνην ἐπὶ δὲ αὐτὸν ἐξανθήσει τὸ ἁγίασμά μου
Edit distance between 2 verses : 0.796875

----- tokens  -----

['καὶ', 'προσέθεντο', 'οἱ', 'υἱοὶ', 'ισραηλ', 'ποιῆσαι', 'τὸ', 'πονηρὸν', 'ἐναντίον', 'κυρίου', 'καὶ', 'παρέδωκεν', 'αὐτοὺς', 'κύριος', 'ἐν', 'χειρὶ', 'ἀλλοφύλων', 'τεσσαράκοντα', 'ἔτη']
['τοὺς', 'ἐχθροὺς', 'αὐτοῦ', 'ἐνδύσω', 'αἰσχύνην', 'ἐπὶ', 'δὲ', 'αὐτὸν', 'ἐξανθήσει', 'τὸ', 'ἁγίασμά', 'μου']
Edit distance between 2 verses : 1.0
CPU times: user 745 μs, sys: 0 ns, total: 745 μs
Wall time: 754 μs

----- lemmata  -----

['καί', 'προστίθημι', 'ὁ', 'υἱός', 'ισραηλ', 'ποιέω', 'ὁ', 'πονηρός', 'ἐναντίον', 'κύριος', 'καί', 'παραδίδωμι', 'αὐτός', 'κύριος', 'ἐν', 'χείρ', 'ἀλλόφυλος', 'τεσσαράκοντα', 'ἔτος']
['ὁ', 'ἐχθρός', 'αὐτός', 'ἐνδύω', 'αἰσχύνη', 'ἐπί', 'δέ', 'αὐτός', 'ἐξανθέω', 'ὁ', 'ἁγίασμα', 'ἐγώ']
Edit d

1.0

In [12]:
id_verse_1 = "Mark 1:1"

v_1 = extract_verse(id_verse_1,Evangiles)
print(v_1['vocabulary'][0])

Unnamed: 0,verse,text,tokens,lemmata,tokens_filtered,lemmata_filtered,bigrams,trigrams,pos,morpho,vocabulary


KeyError: 0

# Save dataframes

In [None]:
# extract Ps verses from dataframe NT
Ps = NT[NT.verse.str.contains("Ps")].reset_index(drop=True) 
Ps = Ps[Ps.verse.str.contains("PsSol")==False].reset_index(drop=True) # Remove PsSol verses
display(Ps.sample(5))

Unnamed: 0,verse,text,tokens,lemmata,tokens_filtered,lemmata_filtered,bigrams,trigrams,pos,morpho,vocabulary
1709,「Ps 103:15」,καὶ οἶνος εὐφραίνει καρδίαν ἀνθρώπου τοῦ ἱλαρῦ...,"[καὶ, οἶνος, εὐφραίνει, καρδίαν, ἀνθρώπου, τοῦ...","[καί, οἶνος, εὐφραίνω, καρδία, ἄνθρωπος, ὁ, ἱλ...","[οἶνος, εὐφραίνει, καρδίαν, ἀνθρώπου, ἱλαρῦναι...","[οἶνος, εὐφραίνω, καρδία, ἄνθρωπος, ἱλαρύνω, ...","[(οἶνος, εὐφραίνω), (εὐφραίνω, καρδία), (καρ...","[(οἶνος, εὐφραίνω, καρδία), (εὐφραίνω, καρδί...","[coordinating_conjunction, noun, verb, noun, n...","[[], [(nominative, accusative, ergative, absol...","[εὐφραίνω, καρδία, οἶνος, πρόσωπον, στηρίζω, ..."
2445,「Ps 143:15」,ἐμακάρισαν τὸν λαόν ᾧ ταῦτά ἐστιν μακάριος ὁ λ...,"[ἐμακάρισαν, τὸν, λαόν, ᾧ, ταῦτά, ἐστιν, μακάρ...","[ἐμακάρισαν, ὁ, λαός, ὅς, οὗτος, εἰμί, μακάριο...","[ἐμακάρισαν, λαόν, ἐστιν, μακάριος, λαός, κύρι...","[μακαρίζω, λαός, εἰμί, μακάριος, λαός, κύριος,...","[(μακαρίζω, λαός), (λαός, εἰμί), (εἰμί, μακάρι...","[(μακαρίζω, λαός, εἰμί), (λαός, εἰμί, μακάριος...","[verb, determiner, noun, pronoun, adjective, a...","[[(habitual, imperfective, iterative, perfecti...","[αὐτός, εἰμί, θεός, κύριος, λαός, μακάριος, μα..."
1588,「Ps 95:1」,ὅτε ὁ οἶκος ᾠκοδομεῖτο μετὰ τὴν αἰχμαλωσίαν ᾠδ...,"[ὅτε, ὁ, οἶκος, ᾠκοδομεῖτο, μετὰ, τὴν, αἰχμαλω...","[ὅτε, ὁ, οἶκος, ᾠκοδομέομαι, μετά, ὁ, αἰχμαλωσ...","[ὅτε, οἶκος, ᾠκοδομεῖτο, αἰχμαλωσίαν, ᾠδὴ, δαυ...","[οἶκος, οἰκοδομέω, αἰχμαλωσία, ᾠδὴ, δαυιδ, ἀεί...","[(οἶκος, οἰκοδομέω), (οἰκοδομέω, αἰχμαλωσία), ...","[(οἶκος, οἰκοδομέω, αἰχμαλωσία), (οἰκοδομέω, α...","[subordinating_conjunction, determiner, noun, ...","[[], [(nominative, accusative, ergative, absol...","[αἰχμαλωσία, γῆ, δαυιδ, καινός, κύριος, οἰκοδο..."
381,「Ps 27:9」,σῶσον τὸν λαόν σου καὶ εὐλόγησον τὴν κληρονομί...,"[σῶσον, τὸν, λαόν, σου, καὶ, εὐλόγησον, τὴν, κ...","[σῴζω, ὁ, λαός, σύ, καί, εὐλόγησον, ὁ, κληρονο...","[σῶσον, λαόν, σου, εὐλόγησον, κληρονομίαν, σου...","[σώζω, λαός, εὐλογέω, κληρονομία, ποίμανον, α...","[(σώζω, λαός), (λαός, εὐλογέω), (εὐλογέω, κλ...","[(σώζω, λαός, εὐλογέω), (λαός, εὐλογέω, κληρ...","[verb, determiner, noun, pronoun, coordinating...","[[(habitual, imperfective, iterative, perfecti...","[αἰών, αὐτός, εὐλογέω, κληρονομία, λαός, ποίμ..."
1791,「Ps 105:17」,ἠνοίχθη ἡ γῆ καὶ κατέπιεν δαθαν καὶ ἐκάλυψεν ἐ...,"[ἠνοίχθη, ἡ, γῆ, καὶ, κατέπιεν, δαθαν, καὶ, ἐκ...","[ἀνοίγνυμι, ὁ, γῆ, καί, καταπίνω, δαθαν, καί, ...","[ἠνοίχθη, γῆ, κατέπιεν, δαθαν, ἐκάλυψεν, συναγ...","[ἀνοίγνυμι, γῆ, καταπίνω, δαθαν, καλύπτω, συνα...","[(ἀνοίγνυμι, γῆ), (γῆ, καταπίνω), (καταπίνω, δ...","[(ἀνοίγνυμι, γῆ, καταπίνω), (γῆ, καταπίνω, δαθ...","[verb, determiner, noun, coordinating_conjunct...","[[(habitual, imperfective, iterative, perfecti...","[αβιρων, γῆ, δαθαν, καλύπτω, καταπίνω, συναγωγ..."


In [18]:
with open("data/Ps.pkl", "wb") as file:
    pickle.dump(Ps, file, protocol=pickle.HIGHEST_PROTOCOL)

## CSV export

In [26]:
# compression_opts = dict(method='zip',archive_name='NT_lemmatized.csv')
NT_simple = NT[['verse','text','tokens','lemmata','pos']]

for i in range(NT_simple.shape[0]):
    lemmes = ''
    for j,l in enumerate(NT_simple.lemmata[i]):
        lemmes = lemmes+' '+l
    NT_simple.lemmata[i] = lemmes


NT_simple.to_csv('NT_lemmatized.csv', index=False)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

