In [8]:
import csv
import pandas as pd
import unicodedata
import os


In [9]:
# Input locations, given relative to the notebook's working directory.
STEP_LEXICON_PATH = 'STEP Data Source/step/step-core-data/src/main/resources/com/tyndalehouse/step/core/data/create/lexicon/lexicon_hebrew.txt'
STEP_CORPORA_PATH = 'STEP Data Source/STEPBible-Data'

# Substring a file name must contain to be treated as a corpus file.
STEP_CORPORA_PREFIX = 'TOTHT'
# STEP_CORPORA_HEADER = ['Ref in Heb', 'Eng ref', 'Pointed', 'Accented', 'Morphology', 'Extended Strongs']

# Column names of the per-word rows written to the combined corpus CSV.
HEB_REF_ATTR = 'hebrewRef'
ENG_REF_ATTR = 'englishRef'
TEXT_ATTR = 'text'
TEXT_QERE_ATTR = 'textQere'
TRAILER_ATTR = 'trailer'
GLOSS_ATTR = 'gloss'
SENSE_GLOSS_ATTR = 'senseGloss'
MORPH_ATTR = 'morph'
STRONGS_ATTR = 'strongs'
TRAILER_STRONGS_ATTR = 'trailerStrongs'

# Full list of the column names above (not referenced by the cells visible here).
STEP_CORPORA_HEADER = [HEB_REF_ATTR, ENG_REF_ATTR, TEXT_ATTR, TEXT_QERE_ATTR, TRAILER_ATTR, GLOSS_ATTR, SENSE_GLOSS_ATTR, MORPH_ATTR, STRONGS_ATTR, TRAILER_STRONGS_ATTR]

# Output directory and file names for the combined corpus CSV.
STEP_DATA_DEST = 'STEP Data Destination'
STEP_CORPUS = 'TOTHT.csv'
STEP_CORPUS_WITH_QERE = 'TOTHT-with-qere.csv'

# File names of the two other corpora that are compared against the STEP corpus.
MACULA_CORPUS = 'macula-hebrew.tsv'
ETCBC_CORPUS = 'word.csv'

In [234]:
class StepBibleHebrewDataProcessor:
    """Flatten the STEPBible TOTHT corpus files into one word-per-row CSV.

    Every six-column, tab-separated data line is split on '/' into its
    words. Each non-trailer word becomes one output row; trailer marks are
    appended to the row of the word they follow.
    """

    # Tokens that are folded into the preceding word's trailer instead of
    # producing a row of their own.
    trailers = ['׃', 'פ', '׀', 'ס', ' פ', '׆', '־']
    # maqef = '־'

    def __init__(self, corpora_files_path, lexicon_file_path):
        """Remember both paths and index the corpus files found on disk.

        Args:
            corpora_files_path: Directory that holds the TOTHT corpus files.
            lexicon_file_path: Path of the lexicon file (stored only; no
                method of this class reads it).
        """
        self.lexicon_file_path = lexicon_file_path
        self.corpora_files_path = corpora_files_path
        self.corpora_files_dict = self.get_corpora_dict()

    def get_corpora_dict(self):
        """Map the first book of each corpus file to that file's name.

        Returns:
            dict keyed by 'Gen', 'Jos', 'Job' and 'Isa'. A value stays ''
            when no file in the directory matches that key.
        """
        # Assuming file format 'TOTHT Gen-Deu - Translators OT Hebrew Tagged text - STEPBible.org CC BY.txt'
        corpora_dict = {
            'Gen': '',
            'Jos': '',
            'Job': '',
            'Isa': ''
        }

        # Assign the files to the references in the dictionary.
        for file in os.listdir(self.corpora_files_path):
            if STEP_CORPORA_PREFIX not in file:
                continue
            for ref in corpora_dict:
                if ref in file:
                    corpora_dict[ref] = file

        return corpora_dict

    @staticmethod
    def _parse_strongs(entry):
        """Split one '='-separated Strong's entry into (number, gloss, sense gloss).

        The last '='-field is split on '_': its first part is the gloss and
        one or two further parts form the sense gloss (joined with '.').
        """
        fields = entry.split('=')
        gloss_data = fields[-1].split('_')

        sense_gloss = None
        if len(gloss_data) == 2:
            sense_gloss = gloss_data[1]
        elif len(gloss_data) == 3:
            sense_gloss = gloss_data[1] + '.' + gloss_data[2]

        return fields[0], gloss_data[0], sense_gloss

    def _rows_from_line(self, row, line_i, lines, with_qere):
        """Build the per-word row dicts for one six-column data line.

        Args:
            row: The tab-split columns of the line.
            line_i: Index of the line inside `lines` (used for diagnostics
                and the qere look-ahead).
            lines: All lines of the file being processed.
            with_qere: Whether the qere look-ahead should run.

        Returns:
            list of dicts, one per non-trailer word, in word order.
        """
        words = row[3].split('/')
        morph_codes = row[4].split('/')
        strongs_data = row[5].split('/')

        records = []
        data = {}
        word_count = 0

        for i, word in enumerate(words):

            if word == '':
                continue

            try:
                strongs_number, gloss, sense_gloss = self._parse_strongs(strongs_data[i])
            except IndexError as e:
                # Fewer Strong's entries than words: report it and leave the
                # fields empty rather than reusing the previous word's values.
                print(e, line_i, i, word, row, words, morph_codes)
                strongs_number = gloss = sense_gloss = None

            # Faulty data: 2Ki.7.15-14.K carries an extra 'H9009#1' word that
            # has no morphology code of its own, so it is skipped.
            if strongs_number == 'H9009#1':
                continue

            if word in self.trailers:
                if TEXT_ATTR in data:
                    data[TRAILER_ATTR] += word
                    data[TRAILER_STRONGS_ATTR] = strongs_number
                else:
                    # No word precedes this trailer on the line; there is
                    # no row to attach it to.
                    print('Trailer without a preceding word', line_i, i, word, row)
                continue

            # A new word starts: flush the row of the previous one.
            if data.get(TEXT_ATTR) is not None:
                records.append(data)
            data = {}

            try:
                # When some words were skipped there are fewer morph codes
                # than words, so count only the words actually kept.
                morph = morph_codes[word_count] if len(words) > len(morph_codes) else morph_codes[i]
            except IndexError as e:
                print(e, line_i, i, word, row)
            else:
                data = {
                    HEB_REF_ATTR: row[0],
                    ENG_REF_ATTR: row[1],
                    TEXT_ATTR: word,
                    # The last word of a line is followed by a space.
                    TRAILER_ATTR: '' if i != len(words) - 1 else ' ',
                    GLOSS_ATTR: gloss,
                    SENSE_GLOSS_ATTR: sense_gloss,
                    MORPH_ATTR: morph,
                    STRONGS_ATTR: strongs_number,
                    TRAILER_STRONGS_ATTR: None,
                }

                if with_qere and '.K' in row[0] and line_i + 1 < len(lines):
                    # Presumably the qere reading is on the line after its
                    # ketiv line -- TODO confirm, then fill TEXT_QERE_ATTR.
                    qere_data = lines[line_i + 1].split('\t')

                    # TODO data[]

            word_count += 1

        if data.get(TEXT_ATTR) is not None:
            records.append(data)

        return records

    # Write all corpora files into a single corpus csv.
    def write_corpora_data(self, with_qere=False):
        """Parse every indexed corpus file and write the combined CSV.

        Args:
            with_qere: Selects the output file name (STEP_CORPUS_WITH_QERE
                instead of STEP_CORPUS) and enables the qere look-ahead,
                which is still a TODO and adds no column yet.

        Raises:
            FileNotFoundError: If a book key has no matching corpus file.
        """
        missing = [ref for ref, file in self.corpora_files_dict.items() if not file]
        if missing:
            raise FileNotFoundError(
                'No corpus file found in ' + repr(self.corpora_files_path)
                + ' for: ' + ', '.join(missing)
            )

        rows = []

        for ref, file in self.corpora_files_dict.items():

            file_path = os.path.join(self.corpora_files_path, file)

            # Explicit encoding: the platform default is not always UTF-8.
            with open(file_path, encoding='utf-8') as f:
                lines = [line.rstrip('\n') for line in f]

            # Track when we've arrived at the Hebrew content in the file.
            atData = False

            for line_i, line in enumerate(lines):

                row = line.split('\t')

                # The first line whose first column contains the book key
                # (e.g. "Gen.1.1-01") marks the start of the data.
                if len(row) > 1 and ref in row[0]:
                    atData = True

                if not atData:
                    continue

                # Don't include qere data.
                if len(row) != 6 or '.Q' in row[0]:
                    continue

                rows.extend(self._rows_from_line(row, line_i, lines, with_qere))

            print('Complete: ' + file)

        # Write the data.
        df = pd.DataFrame(rows)
        write_file = STEP_CORPUS_WITH_QERE if with_qere else STEP_CORPUS
        os.makedirs(STEP_DATA_DEST, exist_ok=True)
        save_path = os.path.join(STEP_DATA_DEST, write_file)
        df.to_csv(save_path, sep=',', encoding='utf-8')

In [235]:
# a = StepBibleHebrewDataProcessor(STEP_CORPORA_PATH, STEP_LEXICON_PATH)
# a.write_corpora_data()
# print(a.corpora_files_dict)

Complete: TOTHT Gen-Deu - Translators OT Hebrew Tagged text - STEPBible.org CC BY.txt
Complete: TOTHT Jos-Est - Translators OT Hebrew Tagged text - STEPBible.org CC BY.txt
Complete: TOTHT Job-Sng - Translators OT Hebrew Tagged text - STEPBible.org CC BY.txt
Complete: TOTHT Isa-Mal - Translators OT Hebrew Tagged text - STEPBible.org CC BY.txt


In [10]:
def hebStripped(word):
    """Return `word` with every combining character removed.

    The text is NFKD-normalized first, so marks that were part of a
    precomposed character are split off and dropped as well.
    """
    decomposed = unicodedata.normalize('NFKD', word)
    base_chars = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)

    return ''.join(base_chars)

In [12]:
class WordFileParser:
    """Cursor over one corpus' word list, used to align it with another corpus.

    The parser loads a word column and a reference column from a CSV/TSV
    file, keeps a cursor (`i`) into them, and collects the aligned output
    (word, reference, comparison code) in three parallel lists.
    """

    # Look-ahead depth that reset_crawl_depth() restores.
    og_crawl_depth = 1
    # According to index == case. (add_row records code 5, which has no label here.)
    word_cases = ["Same", "Dif markings", "Dif spelling", "Dif word, next same", "Dif word, next not same"]

    def __init__(self, file:str, word_col:str, ref_col:str, name:str):
        """Load the two columns as strings and reset all alignment state.

        Args:
            file: Path of a '.csv' or '.tsv' file.
            word_col: Name of the column holding the words.
            ref_col: Name of the column holding the references.
            name: Label for this corpus (callers use it in column names).

        Raises:
            ValueError: If `file` does not end in '.csv' or '.tsv'.
        """
        self.df = pd.read_csv(
            file,
            sep=self.__get_sep(file),
            # Keep empty cells as '' instead of NaN.
            na_filter=False,
            encoding='utf-8',
            usecols=[word_col, ref_col]
            ).astype(str)
        self.words = self.df[word_col].to_list()
        self.refs = self.df[ref_col].to_list()
        self.words_output = []
        self.refs_output = []
        self.cases_output = []
        self.i = 0
        self.crawl_depth = self.og_crawl_depth
        self.length = len(self.words)
        self.name = name


    def __get_sep(self, file:str) -> str:
        """Return the column separator implied by the file extension."""
        if file.endswith('.csv'):
            return ','
        elif file.endswith('.tsv'):
            return '\t'

        raise ValueError(f"Unsupported file extension (expected .csv or .tsv): {file!r}")


    def word(self, i:int=None) -> str:
        """Return the word at index `i`, or at the cursor when `i` is None."""
        # `is None` rather than truthiness, so that index 0 is honoured.
        if i is not None:
            return self.words[i]

        return self.words[self.i]


    def ref(self, i:int=None) -> str:
        """Return the reference at index `i`, or at the cursor when `i` is None."""
        if i is not None:
            return self.refs[i]

        return self.refs[self.i]


    def reset_crawl_depth(self):
        """Restore the look-ahead depth to the class default."""
        self.crawl_depth = self.og_crawl_depth


    def update_output_lists(self, values:list):
        """Append one (word, reference, code) triple to the output lists."""
        self.words_output.append(values[0])
        self.refs_output.append(values[1])
        self.cases_output.append(values[2])


    def word_comparison(self, other_wfp:'WordFileParser', new_index:int=0) -> int:
        """Classify this parser's current word against a word of `other_wfp`.

        The other word is taken at `max(other_wfp.i, new_index)`.

        Returns:
            0 -- the two strings are identical.
            1 -- equal once combining marks are stripped; or both stripped
                 forms are longer than one character, start with the same
                 character and also share the last character or the length.
            2 -- as the second case of 1, but last character and length differ.
            3 -- the words following both positions are equal when stripped.
            4 -- one stripped form is empty, or none of the above applies.
        """
        other_index = max(other_wfp.i, new_index)
        word_a = self.word()
        word_b = other_wfp.word(other_index)

        if word_a == word_b:
            return 0

        word_a_cons = hebStripped(word_a)
        word_b_cons = hebStripped(word_b)

        if word_a_cons == word_b_cons:
            return 1

        if 0 in [len(word_a_cons), len(word_b_cons)]:
            return 4

        if len(word_a_cons) > 1 and len(word_b_cons) > 1 and word_a_cons[0] == word_b_cons[0]:

            if word_a_cons[-1] == word_b_cons[-1] or len(word_a_cons) == len(word_b_cons):
                return 1

            return 2

        # Bounds are checked against other_index (the position actually
        # compared), which can be ahead of other_wfp.i during a crawl.
        if self.i + 1 < self.length and other_index + 1 < other_wfp.length \
        and hebStripped(self.word(self.i+1)) == hebStripped(other_wfp.word(other_index+1)):
            return 3

        return 4


    def crawl(self, other_wfp:'WordFileParser', again=False):
        """Look ahead in `other_wfp` for a match of this parser's current word.

        Up to `self.crawl_depth` words are examined, starting one past the
        other parser's cursor. An exact match (code 0) is always accepted;
        with `again=True` codes 1-3 are accepted too. On acceptance the
        match is recorded through update_comparisons().

        Returns:
            True when a match was recorded, otherwise False.
        """
        depth = 0
        runner_index = other_wfp.i + 1

        while depth < self.crawl_depth and runner_index < other_wfp.length:

            comp = self.word_comparison(other_wfp, new_index=runner_index)

            if comp == 0 or (again and comp in [1,2,3]):
                self.update_comparisons(other_wfp, comp, runner_index)
                return True

            runner_index += 1
            depth += 1

        return False


    def update_comparisons(self, other_wfp:'WordFileParser', comp:int, new_index:int=0):
        """Record a comparison result and advance the cursors.

        Words of `other_wfp` located before `new_index` are emitted first,
        each paired with 'NA' on this side and code 4. Then, if `comp` is
        0-3, the current pair is emitted with that code and both cursors
        move forward by one.

        Returns:
            True when the current pair was recorded, otherwise None.
        """
        while other_wfp.i < new_index:
            other_wfp.update_output_lists([other_wfp.word(), other_wfp.ref(), 4])
            self.update_output_lists(['NA', self.ref(), 4])
            other_wfp.i += 1

        if comp in range(4):

            other_wfp.update_output_lists([other_wfp.word(), other_wfp.ref(), comp])
            self.update_output_lists([self.word(), self.ref(), comp])
            self.i += 1
            other_wfp.i += 1

            return True

        return None


    def add_row(self, other_wfp:'WordFileParser'):
        """Emit the current words of both parsers with code 5 and advance both."""
        other_wfp.update_output_lists([other_wfp.word(), other_wfp.ref(), 5])
        self.update_output_lists([self.word(), self.ref(), 5])
        self.i += 1
        other_wfp.i += 1

In [13]:
# STEP corpus: the combined CSV inside STEP_DATA_DEST, with its word and reference columns.
step_file = os.path.join(STEP_DATA_DEST, STEP_CORPUS)
step_word_col = 'text'
step_ref_col = 'hebrewRef'

# Macula corpus: joined with an empty directory, i.e. resolved against the working directory.
macula_file = os.path.join("", MACULA_CORPUS)
macula_word_col = 'text'
macula_ref_col = 'ref'

# ETCBC corpus: likewise resolved against the working directory.
etcbc_file = os.path.join("", ETCBC_CORPUS)
etcbc_word_col = 'text'
etcbc_ref_col = 'vsIdBHS'

In [14]:
def compare_data(wfp1:'WordFileParser', wfp2:'WordFileParser', max_crawl_depth:int=3, progress_every:int=50000):
    """Walk two parsers in parallel and tabulate how their words line up.

    At each step the current words are compared. If that does not yield a
    recordable match, each parser in turn looks ahead in the other one
    (exact matches first, then looser ones), with the look-ahead depth
    growing up to `max_crawl_depth`. If nothing is found the two words are
    paired as they are via `add_row`. The loop stops as soon as either
    parser is exhausted; remaining words of the other one are not emitted.

    Both parsers are modified: their cursors advance and their output
    lists grow.

    Args:
        wfp1: First parser; its columns come first in the result.
        wfp2: Second parser.
        max_crawl_depth: Largest look-ahead depth tried before giving up.
        progress_every: Print a progress line whenever `wfp1.i` is a
            multiple of this value; 0 or None disables the printing.

    Returns:
        DataFrame of strings with the columns '<name1>Ref', '<name1>Text',
        '<name2>Text', '<name2>Ref' and 'code'.
    """
    while wfp1.i < wfp1.length and wfp2.i < wfp2.length:

        comp = wfp1.word_comparison(wfp2)

        if not wfp1.update_comparisons(wfp2, comp):

            while wfp1.crawl_depth <= max_crawl_depth:
                # Short-circuiting keeps the order: exact look-ahead in
                # both directions first, then the looser `again` pass.
                if (
                    wfp1.crawl(wfp2)
                    or wfp2.crawl(wfp1)
                    or wfp1.crawl(wfp2, again=True)
                    or wfp2.crawl(wfp1, again=True)
                ):
                    break
                wfp1.crawl_depth += 1
                wfp2.crawl_depth += 1

            else:
                # No look-ahead matched at any depth.
                wfp1.add_row(wfp2)

            wfp1.reset_crawl_depth()
            wfp2.reset_crawl_depth()

        # Either cursor may have just moved past its last word, in which
        # case word() would raise IndexError; only report while both are valid.
        if (
            progress_every
            and wfp1.i % progress_every == 0
            and wfp1.i < wfp1.length
            and wfp2.i < wfp2.length
        ):
            print(wfp1.i, wfp1.word(), wfp2.i, wfp2.word())

    table = {
        f"{wfp1.name}Ref": wfp1.refs_output,
        f"{wfp1.name}Text": wfp1.words_output,
        f"{wfp2.name}Text": wfp2.words_output,
        f"{wfp2.name}Ref": wfp2.refs_output,
        # Both parsers record the same code for every row.
        "code": wfp2.cases_output,
    }

    return pd.DataFrame(table).astype(str)

In [15]:
# One parser per corpus; each reads its file once and starts with its cursor at 0 and empty output lists.
step_wfp = WordFileParser(step_file, step_word_col, step_ref_col, 'step')
macula_wfp = WordFileParser(macula_file, macula_word_col, macula_ref_col, 'macula')
etcbc_wfp = WordFileParser(etcbc_file, etcbc_word_col, etcbc_ref_col, 'etcbc')

In [418]:
# Run exactly one pairing per execution. compare_data advances the parsers'
# cursors and appends to their output lists, and nothing resets them, so
# re-create the parsers before running another comparison.
# df = compare_data(macula_wfp, step_wfp)
# df = compare_data(step_wfp, macula_wfp)
df = compare_data(etcbc_wfp, macula_wfp)
# df = compare_data(step_wfp, etcbc_wfp)

# Save the alignment table to the working directory, without the index column.
write_file = 'comp10.csv'
df.to_csv(write_file, encoding='utf-8', index=False)

50000 בַּדִּים֙ 51272 לְ
200000 עַבְדֹּ֑ו 202872 וַ


In [2]:
def nw_align(a, b, replace_func=lambda x, y: -1 if x != y else 0, insert=-1, delete=-1):
    """Globally align two sequences (Needleman-Wunsch, linear gap cost).

    Args:
        a: First sequence.
        b: Second sequence.
        replace_func: Score for pairing an element of `a` with one of `b`
            (by default 0 when equal, -1 otherwise).
        insert: Score for an element of `b` left without a partner.
        delete: Score for an element of `a` left without a partner.

    Returns:
        list of (a_element, b_element) pairs in sequence order; None stands
        in for the missing side of an unpaired element. Among equally
        scored moves the preference is diagonal, then up, then left.
    """
    ZERO, LEFT, UP, DIAGONAL = 0, 1, 2, 3
    len_a, len_b = len(a), len(b)

    # Each cell holds (best score, move that produced it).
    matrix = [[(0, ZERO)] * (len_b + 1) for _ in range(len_a + 1)]
    # Border cells use the same costs as the fill loop below: moving UP
    # consumes an element of `a` (delete), moving LEFT one of `b` (insert).
    for i in range(1, len_a + 1):
        matrix[i][0] = (delete * i, UP)
    for j in range(1, len_b + 1):
        matrix[0][j] = (insert * j, LEFT)

    for i in range(1, len_a + 1):
        for j in range(1, len_b + 1):
            replace = replace_func(a[i - 1], b[j - 1])
            # Tuples compare by score first, then by move code.
            matrix[i][j] = max(
                (matrix[i - 1][j - 1][0] + replace, DIAGONAL),
                (matrix[i][j - 1][0] + insert, LEFT),
                (matrix[i - 1][j][0] + delete, UP),
            )

    # Trace back from the bottom-right corner; pairs are collected in
    # reverse and flipped once at the end.
    i, j = len_a, len_b
    alignment = []
    while (i, j) != (0, 0):
        move = matrix[i][j][1]
        if move == DIAGONAL:
            alignment.append((a[i - 1], b[j - 1]))
            i -= 1
            j -= 1
        elif move == LEFT:
            alignment.append((None, b[j - 1]))
            j -= 1
        else:  # UP
            alignment.append((a[i - 1], None))
            i -= 1

    alignment.reverse()
    return alignment


def replace_func(a, b):
    """Score pairing `a` with `b`: 0 when `a` equals `b[0]`, otherwise -1."""
    return 0 if a == b[0] else -1


def align_text(a, b):
    """Yield, for each element of `a`, the value carried by its partner in `b`.

    `a` and `b` are aligned with nw_align; elements of `b` are indexed as
    `y[0]` (compared against the element of `a`) and `y[1]` (the value
    that is yielded).

    Yields:
        `y[1]` of the partner, or None when the element of `a` has no
        partner. Elements of `b` without a partner produce nothing.
    """
    for x, y in nw_align(a, b, replace_func=replace_func):
        if y is None:
            yield None
        # Test for None rather than truthiness, so that a falsy element of
        # `a` (e.g. an empty string) still gets its entry.
        elif x is not None:
            yield y[1]

In [18]:
# Window to align: `a` is the start offset and `b` the number of words taken from each list.
a = 0
b = 1000
# Aligned with nw_align's default scoring (0 for equal words, -1 otherwise).
aligned = nw_align(macula_wfp.words[a:a+b], etcbc_wfp.words[a:a+b])

In [19]:
# Dump the (macula word, etcbc word) pairs; None marks a gap on that side.
print(aligned)

[('נּוּ', 'אִשָּׁ֖ה'), ('יַ֜יִן', 'מֵ'), ('גַּם', 'אֶ֥רֶץ'), ('הַ', 'מִצְרָֽיִם'), ('לַּ֗יְלָה', 'וַֽ'), ('וּ', 'יְהִי֙'), ('בֹ֨אִי֙', 'בָּ'), ('שִׁכְבִ֣י', ''), ('עִמּ֔', 'עֵ֣ת'), ('וֹ', 'הַ'), ('וּ', 'הִ֔וא'), ('נְחַיֶּ֥ה', 'וַ'), ('מֵ', 'יֹּ֣אמֶר'), ('אָבִ֖י', 'אֲבִימֶ֗לֶךְ'), ('נוּ', 'וּ'), ('זָֽרַע', 'פִיכֹל֙'), ('וַ', 'שַׂר'), ('תַּשְׁקֶ֜יןָ', 'צְבָאֹ֔ו'), ('גַּ֣ם', 'אֶל'), ('בַּ', 'אַבְרָהָ֖ם'), ('', 'לֵ'), ('לַּ֧יְלָה', 'אמֹ֑ר'), ('הַ', 'אֱלֹהִ֣ים'), ('ה֛וּא', 'עִמְּךָ֔'), ('אֶת', 'בְּ'), ('אֲבִי', 'כֹ֥ל'), ('הֶ֖ן', 'אֲשֶׁר'), ('יָ֑יִן', 'אַתָּ֖ה'), ('וַ', 'עֹשֶֽׂה'), ('תָּ֤קָם', 'וְ'), ('הַ', 'עַתָּ֗ה'), ('צְּעִירָה֙', 'הִשָּׁ֨בְעָה'), ('וַ', 'לִּ֤י'), ('תִּשְׁכַּ֣ב', 'בֵֽ'), ('עִמּ֔', 'אלֹהִים֙'), ('וֹ', 'הֵ֔נָּה'), ('וְ', 'אִם'), ('לֹֽא', 'תִּשְׁקֹ֣ר'), ('יָדַ֥ע', 'לִ֔י'), ('בְּ', 'וּ'), ('שִׁכְבָ֖', 'לְ'), ('הּ', 'נִינִ֖י'), ('וּ', 'וּ'), (None, 'לְ'), ('בְ', 'נֶכְדִּ֑י'), ('קֻמָֽ', 'כַּ'), ('הּ', ''), ('וַֽ', 'חֶ֜סֶד'), ('תַּהֲרֶ֛יןָ', 'אֲשֶׁר'), ('שְׁתֵּ֥י', 'עָשִׂ֤יתִי')