# NT text preprocessing

In [3]:
import pandas as pd
import numpy as np  
import openpyxl
import lxml
import nltk

from io import StringIO
from lxml import etree
from tqdm import tqdm 

# NT text preprocessing

## Load NT file from Proiel 

In [None]:
# Function to clean xml:id attributes
def clean_xml_ids(xml_content):
    # Replace invalid characters in xml:id attributes
    cleaned_content = xml_content.replace('xml:id="RHE_Ps_147_12_c _', 'xml:id="RHE_Ps_147_12_c_')
    return cleaned_content

# Read the XML file content
with open('data/greek-nt.xml', 'r', encoding='utf-8') as file:
    xml_content = file.read()

# Clean the xml:id attributes
cleaned_xml_content = clean_xml_ids(xml_content)

# Parse the cleaned XML content
tree = etree.parse('data/greek-nt.xml')
root = tree.getroot()

In [5]:
import unicodedata

def remove_greek_accents(text):
    # Normalize the text to decompose characters into base and diacritics
    normalized_text = unicodedata.normalize('NFD', text)
    # Remove diacritical marks (accents)
    stripped_text = ''.join(char for char in normalized_text if not unicodedata.combining(char))
    # Re-normalize to NFC (composed form)
    return unicodedata.normalize('NFC', stripped_text)

In [5]:
# Build dataframe listed by verse, from TEI root

NT = pd.DataFrame(columns=['verse', 'token', 'lemma', 'pos'])

token_list = []
lemma_list = []
pos_list = []
verse = ''
for elem in root.iter():
    if elem.tag == 'token':
        if elem.get('citation-part') != verse:
            NT.loc[len(NT)] = [verse, token_list, lemma_list, pos_list]         
            token_list = []
            lemma_list = []
            pos_list = []
            verse = elem.get('citation-part')
        if elem.get('lemma') is not None:
            lemma_list.append(str(elem.get('lemma')))
            pos_list.append(str(elem.get('part-of-speech')))
            token_list.append(str(elem.get('form')))

# Delete first line of NT
NT = NT.drop(NT.index[0])

# drop all lines in NT where 'token' is empty
NT = NT[NT['token'].str.len() > 0]

# Group by 'verse' and aggregate 'token', 'lemma', and 'pos' into list but keep original order
import itertools
NT_grouped = NT.groupby('verse').agg({
    'token': lambda x: list(itertools.chain.from_iterable(x)),
    'lemma': lambda x: list(itertools.chain.from_iterable(x)),
    'pos': lambda x: list(itertools.chain.from_iterable(x))
}).reset_index()

# clean numbering in lemma
NT_grouped['lemma'] = NT_grouped['lemma'].apply(lambda x: [i.replace('#1','').replace('#2','').replace('#3','').replace('(','').replace(')','') for i in x])

# Reorder the grouped DataFrame by 'verse' according to the original order in NT
NT_grouped['verse'] = pd.Categorical(NT_grouped['verse'], categories=NT['verse'].unique(), ordered=True)
# Sort the grouped DataFrame by 'verse' to maintain the original order
NT = NT_grouped.sort_values('verse').reset_index(drop=True)

# Display the grouped DataFrame
display(NT)

Unnamed: 0,verse,token,lemma,pos
0,MATT 1.1,"[Βίβλος, γενέσεως, Ἰησοῦ, Χριστοῦ, υἱοῦ, Δαυεὶ...","[βίβλος, γένεσις, Ἰησοῦς, Χριστός, υἱός, Δαυίδ...","[Nb, Nb, Ne, Ne, Nb, Ne, Nb, Ne]"
1,MATT 1.2,"[Ἀβραὰμ, ἐγέννησεν, τὸν, Ἰσαάκ, Ἰσαὰκ, δὲ, ἐγέ...","[Ἀβραάμ, γεννάω, ὁ, Ἰσαάκ, Ἰσαάκ, δέ, γεννάω, ...","[Ne, V-, S-, Ne, Ne, Df, V-, S-, Ne, Ne, Df, V..."
2,MATT 1.3,"[Ἰούδας, δὲ, ἐγέννησεν, τὸν, Φάρες, καὶ, τὸν, ...","[Ἰούδας, δέ, γεννάω, ὁ, Φάρες, καί, ὁ, Ζάρα, ἐ...","[Ne, Df, V-, S-, Ne, C-, S-, Ne, R-, S-, Ne, N..."
3,MATT 1.4,"[Ἀρὰμ, δὲ, ἐγέννησεν, τὸν, Ἀμιναδάβ, Ἀμιναδὰβ,...","[Ἀράμ, δέ, γεννάω, ὁ, Ἀμιναδάβ, Ἀμιναδάβ, δέ, ...","[Ne, Df, V-, S-, Ne, Ne, Df, V-, S-, Ne, Ne, D..."
4,MATT 1.5,"[Σαλμὼν, δὲ, ἐγέννησεν, τὸν, Βόες, ἐκ, τῆς, Ῥα...","[Σαλμών, δέ, γεννάω, ὁ, Βόες, ἐκ, ὁ, Ῥαχάβ, Βό...","[Ne, Df, V-, S-, Ne, R-, S-, Ne, Ne, Df, V-, S..."
...,...,...,...,...
7644,REV 22.17,"[καὶ, τὸ, πνεῦμα, καὶ, ἡ, νύμφη, λέγουσιν, ἔρχ...","[καί, ὁ, πνεῦμα, καί, ὁ, νύμφη, λέγω, ἔρχομαι,...","[C-, S-, Nb, C-, S-, Nb, V-, V-, C-, S-, V-, V..."
7645,REV 22.18,"[μαρτυρῶ, ἐγὼ, παντὶ, τῷ, ἀκούοντι, τοὺς, λόγο...","[μαρτυρέω, ἐγώ, πᾶς, ὁ, ἀκούω, ὁ, λόγος, ὁ, πρ...","[V-, Pp, Px, S-, V-, S-, Nb, S-, Nb, S-, Nb, P..."
7646,REV 22.19,"[καὶ, ἐάν, τις, ἀφέλῃ, ἀπὸ, τῶν, λόγων, τοῦ, β...","[καί, ἐάν, τὶς, ἀφαιρέω, ἀπό, ὁ, λόγος, ὁ, βιβ...","[C-, G-, Px, V-, R-, S-, Nb, S-, Nb, S-, Nb, P..."
7647,REV 22.20,"[Λέγει, ὁ, μαρτυρῶν, ταῦτα, ναί, ἔρχομαι, ταχύ...","[λέγω, ὁ, μαρτυρέω, οὗτος, ναί, ἔρχομαι, ταχύς...","[V-, S-, V-, Pd, I-, V-, A-, I-, V-, Nb, Ne]"


### Loading and applying PoS converter table

In [None]:
# load the pos table
PoS_table = pd.read_csv('data/SC/Proiel_POS.csv',delimiter='\t', skiprows=(0),dtype=str,header=0)

# convert the pos table to a dictionary
pos_dict = PoS_table.set_index('POS_Proiel').T.to_dict('list')

# apply the pos_dict to the NT dataframe
NT['pos'] = NT['pos'].apply(lambda x: [pos_dict.get(i, [''])[0] for i in x])

display(NT.head(30))

Unnamed: 0,verse,token,lemma,pos
0,MATT 1.1,"[Βίβλος, γενέσεως, Ἰησοῦ, Χριστοῦ, υἱοῦ, Δαυεὶ...","[βίβλος, γένεσις, Ἰησοῦς, Χριστός, υἱός, Δαυίδ...","[NOUN, NOUN, PROPN, PROPN, NOUN, PROPN, NOUN, ..."
1,MATT 1.2,"[Ἀβραὰμ, ἐγέννησεν, τὸν, Ἰσαάκ, Ἰσαὰκ, δὲ, ἐγέ...","[Ἀβραάμ, γεννάω, ὁ, Ἰσαάκ, Ἰσαάκ, δέ, γεννάω, ...","[PROPN, VERB, DET, PROPN, PROPN, ADV, VERB, DE..."
2,MATT 1.3,"[Ἰούδας, δὲ, ἐγέννησεν, τὸν, Φάρες, καὶ, τὸν, ...","[Ἰούδας, δέ, γεννάω, ὁ, Φάρες, καί, ὁ, Ζάρα, ἐ...","[PROPN, ADV, VERB, DET, PROPN, CON, DET, PROPN..."
3,MATT 1.4,"[Ἀρὰμ, δὲ, ἐγέννησεν, τὸν, Ἀμιναδάβ, Ἀμιναδὰβ,...","[Ἀράμ, δέ, γεννάω, ὁ, Ἀμιναδάβ, Ἀμιναδάβ, δέ, ...","[PROPN, ADV, VERB, DET, PROPN, PROPN, ADV, VER..."
4,MATT 1.5,"[Σαλμὼν, δὲ, ἐγέννησεν, τὸν, Βόες, ἐκ, τῆς, Ῥα...","[Σαλμών, δέ, γεννάω, ὁ, Βόες, ἐκ, ὁ, Ῥαχάβ, Βό...","[PROPN, ADV, VERB, DET, PROPN, PREP, DET, PROP..."
5,MATT 1.6,"[Ἰεσσαὶ, δὲ, ἐγέννησεν, τὸν, Δαυεὶδ, τὸν, βασι...","[Ἰεσσαί, δέ, γεννάω, ὁ, Δαυίδ, ὁ, βασιλεύς, Δα...","[PROPN, ADV, VERB, DET, PROPN, DET, NOUN, PROP..."
6,MATT 1.7,"[Σολομὼν, δὲ, ἐγέννησεν, τὸν, Ῥοβοάμ, Ῥοβοὰμ, ...","[Σολομών, δέ, γεννάω, ὁ, Ῥοβοάμ, Ῥοβοάμ, δέ, γ...","[PROPN, ADV, VERB, DET, PROPN, PROPN, ADV, VER..."
7,MATT 1.8,"[Ἀσὰφ, δὲ, ἐγέννησεν, τὸν, Ἰωσαφάτ, Ἰωσαφὰτ, δ...","[Ἀσάφ, δέ, γεννάω, ὁ, Ἰωσαφάτ, Ἰωσαφάτ, δέ, γε...","[PROPN, ADV, VERB, DET, PROPN, PROPN, ADV, VER..."
8,MATT 1.9,"[Ὀζείας, δὲ, ἐγέννησεν, τὸν, Ἰωαθάμ, Ἰωαθὰμ, δ...","[Ὀζίας, δέ, γεννάω, ὁ, Ἰωαθάμ, Ἰωαθάμ, δέ, γεν...","[PROPN, ADV, VERB, DET, PROPN, PROPN, ADV, VER..."
9,MATT 1.10,"[Ἑζεκίας, δὲ, ἐγέννησεν, τὸν, Μανασσῆ, Μανασσῆ...","[Ἑζεκίας, δέ, γεννάω, ὁ, Μανασσῆς, Μανασσῆς, δ...","[PROPN, ADV, VERB, DET, PROPN, PROPN, ADV, VER..."


## Integrate lacking verses

In [None]:
NT_add = pd.read_csv('data/corrected_NT_verses.csv', delimiter='\t',header=0,dtype=str)

# delete all ',' in the columns 'token', 'lemma', and 'pos'
NT_add['token'] = NT_add['token'].str.replace("'", '')
NT_add['lemma'] = NT_add['lemma'].str.replace("'", '')
NT_add['pos'] = NT_add['pos'].str.replace("'", '')
NT_add['token'] = NT_add['token'].str.replace('[', '')
NT_add['lemma'] = NT_add['lemma'].str.replace('[', '')
NT_add['pos'] = NT_add['pos'].str.replace('[', '')
NT_add['token'] = NT_add['token'].str.replace(']', '')
NT_add['lemma'] = NT_add['lemma'].str.replace(']', '')
NT_add['pos'] = NT_add['pos'].str.replace(']', '')
NT_add['token'] = NT_add['token'].str.replace(" ", '')
NT_add['lemma'] = NT_add['lemma'].str.replace(" ", '')
NT_add['pos'] = NT_add['pos'].str.replace(" ", '')

# convert string to list
NT_add['token'] = NT_add['token'].apply(lambda x: x.strip('[]').split(','))
NT_add['lemma'] = NT_add['lemma'].apply(lambda x: x.strip('[]').split(','))
NT_add['pos'] = NT_add['pos'].apply(lambda x: x.strip('[]').split(','))

display(NT_add)

Unnamed: 0,verse,token,lemma,pos
0,HEB 13.1,"[ἡ, φιλαδελφία, μενέτω]","[ὁ, φιλαδελφία, μένω]","[DET, NOUN, VERB]"
1,HEB 13.2,"[τῆς, φιλοξενίας, μὴ, ἐπιλανθάνεσθε, διὰ, ταύτ...","[ὁ, φιλοξενία, μή, ἐπιλανθάνω, διά, οὗτος, γάρ...","[DET, NOUN, ADV, VERB, PREP, ADJ, ADV, VERB, A..."
2,HEB 13.3,"[μιμνῄσκεσθε, τῶν, δεσμίων, ὡς, συνδεδεμένοι, ...","[μιμνῄσκω, ὁ, δεσμίον, ὡς, συνδέομαι, ὁ, κακου...","[VERB, DET, NOUN, ADV, VERB, DET, VERB, ADV, A..."
3,HEB 13.4,"[τίμιος, ὁ, γάμος, ἐν, πᾶσιν, καὶ, ἡ, κοίτη, ἀ...","[τίμιος, ὁ, γάμος, ἐν, πᾶς, καί, ὁ, κοίτη, ἀμί...","[ADJ, DET, NOUN, PREP, ADJ, CON, DET, NOUN, AD..."
4,HEB 13.5,"[ἀφιλάργυρος, ὁ, τρόπος, ἀρκούμενοι, τοῖς, παρ...","[ἀφιλάργυρος, ὁ, τρόπος, ἀρκέω, ὁ, πάρειμι, αὐ...","[ADJ, DET, NOUN, VERB, DET, VERB, PRON, ADV, V..."
...,...,...,...,...
285,JUDE 1.19,"[οὗτοί, εἰσιν, οἱ, ἀποδιορίζοντες, ψυχικοί, πν...","[οὗτος, εἰμί, ὁ, ἀποδιορίζω, ψυχικός, πνεῦμα, ...","[ADJ, VERB, DET, VERB, ADJ, NOUN, ADV, VERB]"
286,JUDE 1.20,"[ὑμεῖς, δέ, ἀγαπητοί, ἐποικοδομοῦντες, ἑαυτοὺς...","[ὑμεῖς, δέ, ἀγαπητός, ἐποικοδομέω, ἑαυτοῦ, ὁ, ...","[PRON, ADV, ADJ, VERB, PRON, DET, ADJ, PRON, N..."
287,JUDE 1.21,"[ἑαυτοὺς, ἐν, ἀγάπῃ, θεοῦ, τηρήσατε, προσδεχόμ...","[ἑαυτοῦ, ἐν, ἀγάπη, θεός, τηράω, προσδέχομαι, ...","[PRON, PREP, NOUN, NOUN, VERB, VERB, DET, NOUN..."
288,JUDE 1.24,"[τῷ, δὲ, δυναμένῳ, φυλάξαι, ὑμᾶς, ἀπταίστους, ...","[ὁ, δέ, δύναμαι, φυλάσσω, ὑμεῖς, ἀπταίστος, κα...","[DET, ADV, VERB, VERB, PRON, ADJ, CON, VERB, P..."


In [8]:
# insert the new rows from NT_add into NT at the correct positions (based on verse numbers manually checked)
NT_merged = pd.concat([NT.iloc[:7077], NT_add.iloc[0:25], NT.iloc[7077:7239],NT_add.iloc[25:255],NT.iloc[7239:7240],NT_add.iloc[255:264],NT.iloc[7240:7241],NT_add.iloc[264:269],NT.iloc[7241:7242],NT_add.iloc[269:288],NT.iloc[7242:7244],NT_add.iloc[288:291],NT[7244:]]).reset_index(drop=True)

NT = NT_merged

# change naming of verses
dict_names = {
    '1COR':'1Co', 
    '1JOHN':'1Jn', 
    '1PET':'1P', 
    '1THESS':'1Th',
    '1TIM':'1Tm',
    '2COR':'2Co',
    '2JOHN': '2Jn',
    '2PET': '2P',
    '2THESS': '2Th',
    '2TIM': '2Tm',
    '3JOHN': '3Jn',
    'ACTS': 'Ac',
    'COL': 'Col',
    'EPH': 'Ep',
    'GAL': 'Ga',
    'HEB': 'He',
    'JAS': 'Ja',
    'JOHN': 'Jn',
    'JUDE': 'Ju',
    'LUKE': 'Lk',
    'MARK': 'Mk',
    'MATT': 'Mt',
    'PHILEM': 'Phm',
    'PHIL': 'Phi',
    'REV':'Re',
    'ROM': 'Ro',
    'TIT': 'Ti',
}

for old, new in dict_names.items():
    NT['verse'] = NT['verse'].str.replace(old, new)
NT['verse'] = NT['verse'].str.replace('.',',')

# normalize lemmas and tokens /!\ due to different encodings Ps/NT, edit distance does not work if accents/spirits are not removed
NT['lemma'] = NT['lemma'].apply(lambda x: [remove_greek_accents(i) for i in x])
NT['lemma'] = NT['lemma'].apply(lambda x: [i.lower() for i in x])
NT['token'] = NT['token'].apply(lambda x: [remove_greek_accents(i) for i in x])
NT['token'] = NT['token'].apply(lambda x: [i.lower() for i in x])

# Display the merged DataFrame
display(NT)

Unnamed: 0,verse,token,lemma,pos
0,"Mt 1,1","[βιβλος, γενεσεως, ιησου, χριστου, υιου, δαυει...","[βιβλος, γενεσις, ιησους, χριστος, υιος, δαυιδ...","[NOUN, NOUN, PROPN, PROPN, NOUN, PROPN, NOUN, ..."
1,"Mt 1,2","[αβρααμ, εγεννησεν, τον, ισαακ, ισαακ, δε, εγε...","[αβρααμ, γενναω, ο, ισαακ, ισαακ, δε, γενναω, ...","[PROPN, VERB, DET, PROPN, PROPN, ADV, VERB, DE..."
2,"Mt 1,3","[ιουδας, δε, εγεννησεν, τον, φαρες, και, τον, ...","[ιουδας, δε, γενναω, ο, φαρες, και, ο, ζαρα, ε...","[PROPN, ADV, VERB, DET, PROPN, CON, DET, PROPN..."
3,"Mt 1,4","[αραμ, δε, εγεννησεν, τον, αμιναδαβ, αμιναδαβ,...","[αραμ, δε, γενναω, ο, αμιναδαβ, αμιναδαβ, δε, ...","[PROPN, ADV, VERB, DET, PROPN, PROPN, ADV, VER..."
4,"Mt 1,5","[σαλμων, δε, εγεννησεν, τον, βοες, εκ, της, ρα...","[σαλμων, δε, γενναω, ο, βοες, εκ, ο, ραχαβ, βο...","[PROPN, ADV, VERB, DET, PROPN, PREP, DET, PROP..."
...,...,...,...,...
7934,"Re 22,17","[και, το, πνευμα, και, η, νυμφη, λεγουσιν, ερχ...","[και, ο, πνευμα, και, ο, νυμφη, λεγω, ερχομαι,...","[CON, DET, NOUN, CON, DET, NOUN, VERB, VERB, C..."
7935,"Re 22,18","[μαρτυρω, εγω, παντι, τω, ακουοντι, τους, λογο...","[μαρτυρεω, εγω, πας, ο, ακουω, ο, λογος, ο, πρ...","[VERB, PRON, PRON, DET, VERB, DET, NOUN, DET, ..."
7936,"Re 22,19","[και, εαν, τις, αφελη, απο, των, λογων, του, β...","[και, εαν, τις, αφαιρεω, απο, ο, λογος, ο, βιβ...","[CON, CON, PRON, VERB, PREP, DET, NOUN, DET, N..."
7937,"Re 22,20","[λεγει, ο, μαρτυρων, ταυτα, ναι, ερχομαι, ταχυ...","[λεγω, ο, μαρτυρεω, ουτος, ναι, ερχομαι, ταχυς...","[VERB, DET, VERB, PRON, INTJ, VERB, ADJ, INTJ,..."


## Remove stop words and add lexicon column

### /!\ Specific test case: GreCy lemmatization

In [7]:
import spacy as sc
# # Load the Greek model
nlp = sc.load("grc_proiel_trf")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Apply spaCy NLP pipeline to the text

# for i in tqdm(range(0, len(NT))):
#     doc = nlp(" ".join(NT.loc[i,'token']))
#     doc = [token for token in doc if token.text != ' ']

#     # ----------------------- GreCy lemmatisation
#     # NT.loc[:,'lemma'][i] = [token.lemma_ for token in doc]
#     # ------------------------------------------

#     NT.loc[:,'pos'][i] = [token.pos_ for token in doc]

# display(NT.head(20))

In [12]:
# extract list of greek stop words from the model
greek_stop_words = nlp.Defaults.stop_words
greek_stop_words = pd.DataFrame(list(greek_stop_words), columns=['stop_word'])

# normalize the greek stop words to remove accents and convert to lowercase
greek_stop_words['stop_word'] = greek_stop_words['stop_word'].apply(lambda x: remove_greek_accents(x))
greek_stop_words['stop_word'] = greek_stop_words['stop_word'].str.lower()
greek_stop_words = greek_stop_words.sort_values(by='stop_word').reset_index(drop=True)
# Save the stop words to a CSV file
greek_stop_words.to_csv('data/export/greek_stop_words.csv', index=False, encoding='utf-8',sep='\t')

In [11]:
# normalize the 'lemma' column to remove accents and convert to lowercase
NT['lemma_filtered'] = NT['lemma']

# create a 'lemma_filtered' column containing the lemma column without stop words
NT['lemma_filtered'] = NT['lemma_filtered'].apply(lambda x: [word for word in x if word not in greek_stop_words['stop_word'].tolist()])

# create a 'stop_words' column containing the stop words in the lemma column
NT['stop_words'] = NT['lemma'].apply(lambda x: [word for word in x if word in greek_stop_words['stop_word'].tolist()])

# add a column 'lexicon' containing the unique lemma_filtered of the verse in alphabetical order
NT['lexicon'] = NT['lemma_filtered'].apply(lambda x: sorted(set(x)))

display(NT)

Unnamed: 0,verse,token,lemma,pos,lemma_filtered,stop_words,lexicon
0,"Mt 1,1","[βιβλος, γενεσεως, ιησου, χριστου, υιου, δαυει...","[βιβλος, γενεσις, ιησους, χριστος, υιος, δαυιδ...","[NOUN, NOUN, PROPN, PROPN, NOUN, PROPN, NOUN, ...","[βιβλος, γενεσις, ιησους, χριστος, υιος, δαυιδ...",[],"[αβρααμ, βιβλος, γενεσις, δαυιδ, ιησους, υιος,..."
1,"Mt 1,2","[αβρααμ, εγεννησεν, τον, ισαακ, ισαακ, δε, εγε...","[αβρααμ, γενναω, ο, ισαακ, ισαακ, δε, γενναω, ...","[PROPN, VERB, DET, PROPN, PROPN, ADV, VERB, DE...","[αβρααμ, γενναω, ισαακ, ισαακ, γενναω, ιακωβ, ...","[ο, δε, ο, δε, ο, και, ο, αυτος]","[αβρααμ, αδελφος, γενναω, ιακωβ, ιουδας, ισαακ]"
2,"Mt 1,3","[ιουδας, δε, εγεννησεν, τον, φαρες, και, τον, ...","[ιουδας, δε, γενναω, ο, φαρες, και, ο, ζαρα, ε...","[PROPN, ADV, VERB, DET, PROPN, CON, DET, PROPN...","[ιουδας, γενναω, φαρες, ζαρα, θαμαρ, φαρες, γε...","[δε, ο, και, ο, εκ, ο, δε, ο, δε, ο]","[αραμ, γενναω, εσρωμ, ζαρα, θαμαρ, ιουδας, φαρες]"
3,"Mt 1,4","[αραμ, δε, εγεννησεν, τον, αμιναδαβ, αμιναδαβ,...","[αραμ, δε, γενναω, ο, αμιναδαβ, αμιναδαβ, δε, ...","[PROPN, ADV, VERB, DET, PROPN, PROPN, ADV, VER...","[αραμ, γενναω, αμιναδαβ, αμιναδαβ, γενναω, ναα...","[δε, ο, δε, ο, δε, ο]","[αμιναδαβ, αραμ, γενναω, ναασσων, σαλμων]"
4,"Mt 1,5","[σαλμων, δε, εγεννησεν, τον, βοες, εκ, της, ρα...","[σαλμων, δε, γενναω, ο, βοες, εκ, ο, ραχαβ, βο...","[PROPN, ADV, VERB, DET, PROPN, PREP, DET, PROP...","[σαλμων, γενναω, βοες, ραχαβ, βοες, γενναω, ιω...","[δε, ο, εκ, ο, δε, ο, εκ, ο, δε, ο]","[βοες, γενναω, ιεσσαι, ιωβηδ, ραχαβ, ρουθ, σαλ..."
...,...,...,...,...,...,...,...
7934,"Re 22,17","[και, το, πνευμα, και, η, νυμφη, λεγουσιν, ερχ...","[και, ο, πνευμα, και, ο, νυμφη, λεγω, ερχομαι,...","[CON, DET, NOUN, CON, DET, NOUN, VERB, VERB, C...","[πνευμα, νυμφη, λεγω, ερχομαι, ακουω, λεγω, ερ...","[και, ο, και, ο, και, ο, και, ο, ο]","[ακουω, διψαω, δωρεαν, εθελω, ερχομαι, ζωη, λα..."
7935,"Re 22,18","[μαρτυρω, εγω, παντι, τω, ακουοντι, τους, λογο...","[μαρτυρεω, εγω, πας, ο, ακουω, ο, λογος, ο, πρ...","[VERB, PRON, PRON, DET, VERB, DET, NOUN, DET, ...","[μαρτυρεω, πας, ακουω, λογος, προφητεια, βιβλι...","[εγω, ο, ο, ο, ο, ουτος, εαν, τις, επι, αυτος,...","[ακουω, βιβλιον, γραφω, επιτιθημι, θεος, λογος..."
7936,"Re 22,19","[και, εαν, τις, αφελη, απο, των, λογων, του, β...","[και, εαν, τις, αφαιρεω, απο, ο, λογος, ο, βιβ...","[CON, CON, PRON, VERB, PREP, DET, NOUN, DET, N...","[αφαιρεω, λογος, βιβλιον, προφητεια, αφαιρεω, ...","[και, εαν, τις, απο, ο, ο, ο, ουτος, ο, ο, αυτ...","[αγιος, αφαιρεω, βιβλιον, γραφω, ζωη, θεος, λο..."
7937,"Re 22,20","[λεγει, ο, μαρτυρων, ταυτα, ναι, ερχομαι, ταχυ...","[λεγω, ο, μαρτυρεω, ουτος, ναι, ερχομαι, ταχυς...","[VERB, DET, VERB, PRON, INTJ, VERB, ADJ, INTJ,...","[λεγω, μαρτυρεω, ναι, ερχομαι, ταχυς, αμην, ερ...","[ο, ουτος]","[αμην, ερχομαι, ιησους, κυριος, λεγω, μαρτυρεω..."


## Loading and integrating Trench's synonyms database

In [12]:
synonyms = pd.read_csv('data/Greek_synonyms.csv',sep='\t',dtype=str)
synonyms['terms'] = synonyms['terms'].apply(lambda x: remove_greek_accents(x))
synonyms['terms'] = synonyms['terms'].str.lower()
synonyms['terms'] = synonyms['terms'].apply(lambda x: x.split(','))
synonyms['terms'] = synonyms['terms'].apply(lambda x: [s.strip() for s in x])

list_synonyms = list(synonyms['terms'].explode().unique())

for i,s in enumerate(list_synonyms):
    # find all indices of synonyms['terms'] containing s
    indices = [j for j, sublist in enumerate(synonyms['terms']) if s in sublist]
    if len(indices) > 1:
        combined_list = list(set([item for idx in indices for item in synonyms['terms'][idx]]))
        combined_list = sorted(combined_list)
        for idx in indices:
            synonyms.at[idx, 'terms'] = combined_list
synonyms = synonyms.drop_duplicates(subset=['terms']).reset_index(drop=True)
synonyms['terms'] = synonyms['terms'].apply(lambda x: sorted(x))

display(synonyms['terms'])

0            [εκκλησια, πανηγυρις, συναγωγη]
1                          [θειοτης, θεοτης]
2                              [ιερον, ναος]
3                         [ελεγχω, επιτιμαω]
4                         [αναθεμα, αναθημα]
                       ...                  
111                [εδραιος, τεθεμελιωμενος]
112                         [θνητος, νεκρος]
113                  [καταλαλος, ψιθυριστης]
114                      [αχρειος, αχρηστος]
115    [γραμματευς, νομικος, νομοδιδασκαλος]
Name: terms, Length: 116, dtype: object

## Loading and integrating Louw-Nida database

In [13]:
# Function to clean xml:id attributes
def clean_xml_ids(xml_content):
    # Replace invalid characters in xml:id attributes
    cleaned_content = xml_content.replace('xml:id="RHE_Ps_147_12_c _', 'xml:id="RHE_Ps_147_12_c_')
    return cleaned_content

# Read the XML file content
with open('data/UBSGreekNTDic-v1.0-en.xml', 'r', encoding='utf-8') as file:
    xml_content = file.read()

# Parse the cleaned XML content
tree = etree.parse('data/UBSGreekNTDic-v1.0-en.xml')
root = tree.getroot()

In [None]:
UBS = pd.DataFrame(columns=['lemma', 'domain', 'subdomain','definition'])

lemma_list = []
domain_list = [[]]
subdomain_list = [[]]

i = -1
j = -1
for elem in root.iter():
    if elem.tag == 'Lexicon_Entry':
        lemma_list.append(str(elem.attrib.get('Lemma')))
        domain_list.append([])
        subdomain_list.append([])
        i +=1
    if elem.tag == 'LEXDomain':
        domain_list[i].append(str(elem.text))
    if elem.tag == 'LEXSubDomain':
        subdomain_list[i].append(str(elem.text))

# delete last line of domain_list and subdomain_list
domain_list = domain_list[:-1]
subdomain_list = subdomain_list[:-1]

# include lemma_list in UBS
UBS['lemma'] = lemma_list
UBS['domain'] = domain_list
UBS['subdomain'] = subdomain_list

# normalize lemmas
UBS['lemma'] = UBS['lemma'].str.lower()
UBS['lemma'] = UBS['lemma'].apply(remove_greek_accents)

display(UBS)

Unnamed: 0,lemma,domain,subdomain,definition
0,α,[Number],"[First, Second, Third, Etc. [Ordinals]]","[first in a series involving time, space, or set]"
1,ααρων,[Names of Persons and Places],[Persons],[the elder brother of Moses and Israel’s first...
2,αβαδδων,[Names of Persons and Places],[Persons],[the Hebrew name for the ruling angel in Hell;...
3,αβαρης,"[Possess, Transfer, Exchange]",[Be a Financial Burden],[(a figurative extension of meaning of ἀβαρής ...
4,αββα,[Supernatural Beings and Powers],[Supernatural Beings],[(title for God; a Greek transliteration of an...
...,...,...,...,...
5502,ωταριον,"[Body, Body Parts, and Body Products]",[Parts of the Body],[None]
5503,ωτιον,"[Body, Body Parts, and Body Products]",[Parts of the Body],[None]
5504,ωφελεια,[Value],"[Advantageous, Not Advantageous]",[the state of having acquired an advantage or ...
5505,ωφελεω,"[Help, Care For, Aspect]","[Help, Complete, Finish, Succeed]","[to provide assistance, with emphasis upon the..."


In [None]:
# Add a column 'domain' to NT dataframe containing a list of UBS['domain'] corresponding to the lemmas listed in each row of NT dataframes

NT['domain'] = [[] for _ in range(len(NT))]
NT['subdomain'] = [[] for _ in range(len(NT))]

for i in tqdm(range(len(NT['lexicon']))):
    for j in range(len(NT['lexicon'].values[i])):
        lemma = NT['lexicon'].values[i][j]
        lemma = remove_greek_accents(lemma)
        # find lemma in UBS dataframe
        if len(UBS[UBS['lemma'] == lemma]) > 0:
            NT['domain'].values[i].append(UBS[UBS['lemma'] == lemma]['domain'].values[0])
            NT['subdomain'].values[i].append(UBS[UBS['lemma'] == lemma]['subdomain'].values[0])
        else:
            NT['domain'].values[i].append([''])
            NT['subdomain'].values[i].append([''])
display(NT)

100%|██████████| 7939/7939 [01:07<00:00, 118.42it/s]


Unnamed: 0,verse,token,lemma,pos,lemma_filtered,stop_words,lexicon,domain,subdomain
0,"Mt 1,1","[βιβλος, γενεσεως, ιησου, χριστου, υιου, δαυει...","[βιβλος, γενεσις, ιησους, χριστος, υιος, δαυιδ...","[NOUN, NOUN, PROPN, PROPN, NOUN, PROPN, NOUN, ...","[βιβλος, γενεσις, ιησους, χριστος, υιος, δαυιδ...",[],"[αβρααμ, βιβλος, γενεσις, δαυιδ, ιησους, υιος,...","[[Names of Persons and Places, Geographical Ob...","[[Persons, Regions Above the Earth], [Instrume..."
1,"Mt 1,2","[αβρααμ, εγεννησεν, τον, ισαακ, ισαακ, δε, εγε...","[αβρααμ, γενναω, ο, ισαακ, ισαακ, δε, γενναω, ...","[PROPN, VERB, DET, PROPN, PROPN, ADV, VERB, DE...","[αβρααμ, γενναω, ισαακ, ισαακ, γενναω, ιακωβ, ...","[ο, δε, ο, δε, ο, και, ο, αυτος]","[αβρααμ, αδελφος, γενναω, ιακωβ, ιουδας, ισαακ]","[[Names of Persons and Places, Geographical Ob...","[[Persons, Regions Above the Earth], [Kinship ..."
2,"Mt 1,3","[ιουδας, δε, εγεννησεν, τον, φαρες, και, τον, ...","[ιουδας, δε, γενναω, ο, φαρες, και, ο, ζαρα, ε...","[PROPN, ADV, VERB, DET, PROPN, CON, DET, PROPN...","[ιουδας, γενναω, φαρες, ζαρα, θαμαρ, φαρες, γε...","[δε, ο, και, ο, εκ, ο, δε, ο, δε, ο]","[αραμ, γενναω, εσρωμ, ζαρα, θαμαρ, ιουδας, φαρες]","[[Names of Persons and Places], [Physiological...","[[Persons], [Birth, Procreation, Birth, Procre..."
3,"Mt 1,4","[αραμ, δε, εγεννησεν, τον, αμιναδαβ, αμιναδαβ,...","[αραμ, δε, γενναω, ο, αμιναδαβ, αμιναδαβ, δε, ...","[PROPN, ADV, VERB, DET, PROPN, PROPN, ADV, VER...","[αραμ, γενναω, αμιναδαβ, αμιναδαβ, γενναω, ναα...","[δε, ο, δε, ο, δε, ο]","[αμιναδαβ, αραμ, γενναω, ναασσων, σαλμων]","[[Names of Persons and Places], [Names of Pers...","[[Persons], [Persons], [Birth, Procreation, Bi..."
4,"Mt 1,5","[σαλμων, δε, εγεννησεν, τον, βοες, εκ, της, ρα...","[σαλμων, δε, γενναω, ο, βοες, εκ, ο, ραχαβ, βο...","[PROPN, ADV, VERB, DET, PROPN, PREP, DET, PROP...","[σαλμων, γενναω, βοες, ραχαβ, βοες, γενναω, ιω...","[δε, ο, εκ, ο, δε, ο, εκ, ο, δε, ο]","[βοες, γενναω, ιεσσαι, ιωβηδ, ραχαβ, ρουθ, σαλ...","[[Names of Persons and Places, Names of Person...","[[Persons, Persons], [Birth, Procreation, Birt..."
...,...,...,...,...,...,...,...,...,...
7934,"Re 22,17","[και, το, πνευμα, και, η, νυμφη, λεγουσιν, ερχ...","[και, ο, πνευμα, και, ο, νυμφη, λεγω, ερχομαι,...","[CON, DET, NOUN, CON, DET, NOUN, VERB, VERB, C...","[πνευμα, νυμφη, λεγω, ερχομαι, ακουω, λεγω, ερ...","[και, ο, και, ο, και, ο, και, ο, ο]","[ακουω, διψαω, δωρεαν, εθελω, ερχομαι, ζωη, λα...","[[Sensory Events and States, Sensory Events an...","[[Hear, Hear, Inform, Announce, Accept As True..."
7935,"Re 22,18","[μαρτυρω, εγω, παντι, τω, ακουοντι, τους, λογο...","[μαρτυρεω, εγω, πας, ο, ακουω, ο, λογος, ο, πρ...","[VERB, PRON, PRON, DET, VERB, DET, NOUN, DET, ...","[μαρτυρεω, πας, ακουω, λογος, προφητεια, βιβλι...","[εγω, ο, ο, ο, ο, ουτος, εαν, τις, επι, αυτος,...","[ακουω, βιβλιον, γραφω, επιτιθημι, θεος, λογος...","[[Sensory Events and States, Sensory Events an...","[[Hear, Hear, Inform, Announce, Accept As True..."
7936,"Re 22,19","[και, εαν, τις, αφελη, απο, των, λογων, του, β...","[και, εαν, τις, αφαιρεω, απο, ο, λογος, ο, βιβ...","[CON, CON, PRON, VERB, PREP, DET, NOUN, DET, N...","[αφαιρεω, λογος, βιβλιον, προφητεια, αφαιρεω, ...","[και, εαν, τις, απο, ο, ο, ο, ουτος, ο, ο, αυτ...","[αγιος, αφαιρεω, βιβλιον, γραφω, ζωη, θεος, λο...",[[Moral and Ethical Qualities and Related Beha...,"[[Holy, Pure, Dedicate, Consecrate, Socio-Reli..."
7937,"Re 22,20","[λεγει, ο, μαρτυρων, ταυτα, ναι, ερχομαι, ταχυ...","[λεγω, ο, μαρτυρεω, ουτος, ναι, ερχομαι, ταχυς...","[VERB, DET, VERB, PRON, INTJ, VERB, ADJ, INTJ,...","[λεγω, μαρτυρεω, ναι, ερχομαι, ταχυς, αμην, ερ...","[ο, ουτος]","[αμην, ερχομαι, ιησους, κυριος, λεγω, μαρτυρεω...","[[True, False], [Linear Movement, Linear Movem...","[[True, False], [Move, Come/Go, Come, Come To,..."


In [None]:
def concatenate_all_lists(list_of_lists):
    if len(list_of_lists) < 2:
        if list_of_lists == []:
            return list_of_lists
        else:
            return list_of_lists[0]
    else:
        list_of_lists[0] = list_of_lists[0] + list_of_lists[1]
        del list_of_lists[1]
        return concatenate_all_lists(list_of_lists)    

for i in tqdm(range(len(NT['domain']))):
    NT['domain'].values[i] = concatenate_all_lists(NT['domain'].values[i])
    NT['subdomain'].values[i] = concatenate_all_lists(NT['subdomain'].values[i])

    # delete '' from domain and subdomain
    NT['domain'].values[i] = [x for x in NT['domain'].values[i] if x != '']
    NT['subdomain'].values[i] = [x for x in NT['subdomain'].values[i] if x != '']

    NT['domain'].values[i] = list(set(NT['domain'].values[i]))
    NT['subdomain'].values[i] = list(set(NT['subdomain'].values[i]))

display(NT)

  0%|          | 0/7939 [00:00<?, ?it/s]

100%|██████████| 7939/7939 [00:00<00:00, 14634.89it/s]


Unnamed: 0,verse,token,lemma,pos,lemma_filtered,stop_words,lexicon,domain,subdomain
0,"Mt 1,1","[βιβλος, γενεσεως, ιησου, χριστου, υιου, δαυει...","[βιβλος, γενεσις, ιησους, χριστος, υιος, δαυιδ...","[NOUN, NOUN, PROPN, PROPN, NOUN, PROPN, NOUN, ...","[βιβλος, γενεσις, ιησους, χριστος, υιος, δαυιδ...",[],"[αβρααμ, βιβλος, γενεσις, δαυιδ, ιησους, υιος,...","[Kinship Terms, Supernatural Beings and Powers...","[Socio-Political, Kinship Relations Involving ..."
1,"Mt 1,2","[αβρααμ, εγεννησεν, τον, ισαακ, ισαακ, δε, εγε...","[αβρααμ, γενναω, ο, ισαακ, ισαακ, δε, γενναω, ...","[PROPN, VERB, DET, PROPN, PROPN, ADV, VERB, DE...","[αβρααμ, γενναω, ισαακ, ισαακ, γενναω, ιακωβ, ...","[ο, δε, ο, δε, ο, και, ο, αυτος]","[αβρααμ, αδελφος, γενναω, ιακωβ, ιουδας, ισαακ]","[Kinship Terms, Be, Become, Exist, Happen, Geo...","[Socio-Political, Socio-Religious, Birth, Proc..."
2,"Mt 1,3","[ιουδας, δε, εγεννησεν, τον, φαρες, και, τον, ...","[ιουδας, δε, γενναω, ο, φαρες, και, ο, ζαρα, ε...","[PROPN, ADV, VERB, DET, PROPN, CON, DET, PROPN...","[ιουδας, γενναω, φαρες, ζαρα, θαμαρ, φαρες, γε...","[δε, ο, και, ο, εκ, ο, δε, ο, δε, ο]","[αραμ, γενναω, εσρωμ, ζαρα, θαμαρ, ιουδας, φαρες]","[Names of Persons and Places, Be, Become, Exis...","[Birth, Procreation, Change Behavior, Change o..."
3,"Mt 1,4","[αραμ, δε, εγεννησεν, τον, αμιναδαβ, αμιναδαβ,...","[αραμ, δε, γενναω, ο, αμιναδαβ, αμιναδαβ, δε, ...","[PROPN, ADV, VERB, DET, PROPN, PROPN, ADV, VER...","[αραμ, γενναω, αμιναδαβ, αμιναδαβ, γενναω, ναα...","[δε, ο, δε, ο, δε, ο]","[αμιναδαβ, αραμ, γενναω, ναασσων, σαλμων]","[Names of Persons and Places, Be, Become, Exis...","[Birth, Procreation, Change Behavior, Change o..."
4,"Mt 1,5","[σαλμων, δε, εγεννησεν, τον, βοες, εκ, της, ρα...","[σαλμων, δε, γενναω, ο, βοες, εκ, ο, ραχαβ, βο...","[PROPN, ADV, VERB, DET, PROPN, PREP, DET, PROP...","[σαλμων, γενναω, βοες, ραχαβ, βοες, γενναω, ιω...","[δε, ο, εκ, ο, δε, ο, εκ, ο, δε, ο]","[βοες, γενναω, ιεσσαι, ιωβηδ, ραχαβ, ρουθ, σαλ...","[Names of Persons and Places, Be, Become, Exis...","[Birth, Procreation, Change Behavior, Change o..."
...,...,...,...,...,...,...,...,...,...
7934,"Re 22,17","[και, το, πνευμα, και, η, νυμφη, λεγουσιν, ερχ...","[και, ο, πνευμα, και, ο, νυμφη, λεγω, ερχομαι,...","[CON, DET, NOUN, CON, DET, NOUN, VERB, VERB, C...","[πνευμα, νυμφη, λεγω, ερχομαι, ακουω, λεγω, ερ...","[και, ο, και, ο, και, ο, και, ο, ο]","[ακουω, διψαω, δωρεαν, εθελω, ερχομαι, ζωη, λα...","[Be, Become, Exist, Happen, Courts and Legal P...","[Be Eager, Be Earnest, In a Devoted Manner, Re..."
7935,"Re 22,18","[μαρτυρω, εγω, παντι, τω, ακουοντι, τους, λογο...","[μαρτυρεω, εγω, πας, ο, ακουω, ο, λογος, ο, πρ...","[VERB, PRON, PRON, DET, VERB, DET, NOUN, DET, ...","[μαρτυρεω, πας, ακουω, λογος, προφητεια, βιβλι...","[εγω, ο, ο, ο, ο, ουτος, εαν, τις, επι, αυτος,...","[ακουω, βιβλιον, γραφω, επιτιθημι, θεος, λογος...","[Hostility, Strife, Violence, Harm, Destroy, K...","[Cause Trouble, Hardship, Sickness, Disease, W..."
7936,"Re 22,19","[και, εαν, τις, αφελη, απο, των, λογων, του, β...","[και, εαν, τις, αφαιρεω, απο, ο, λογος, ο, βιβ...","[CON, CON, PRON, VERB, PREP, DET, NOUN, DET, N...","[αφαιρεω, λογος, βιβλιον, προφητεια, αφαιρεω, ...","[και, εαν, τις, απο, ο, ο, ο, ουτος, ο, ο, αυτ...","[αγιος, αφαιρεω, βιβλιον, γραφω, ζωη, θεος, λο...","[Existence in Space, Be, Become, Exist, Happen...","[State, Socio-Political, Socio-Religious, Keep..."
7937,"Re 22,20","[λεγει, ο, μαρτυρων, ταυτα, ναι, ερχομαι, ταχυ...","[λεγω, ο, μαρτυρεω, ουτος, ναι, ερχομαι, ταχυς...","[VERB, DET, VERB, PRON, INTJ, VERB, ADJ, INTJ,...","[λεγω, μαρτυρεω, ναι, ερχομαι, ταχυς, αμην, ερ...","[ο, ουτος]","[αμην, ερχομαι, ιησους, κυριος, λεγω, μαρτυρεω...","[Supernatural Beings and Powers, Linear Moveme...","[Speak, Talk, Have an Opinion, Hold a View, Mo..."


## CSV and PKL export

In [None]:
# Save the NT dataframe to a CSV file
NT.to_csv('data/NT.csv', index=False,sep='\t', encoding='utf-8')

import pickle
with open("data/NT.pkl", "wb") as file:
    pickle.dump(NT, file, protocol=pickle.HIGHEST_PROTOCOL)