In [48]:
import csv
import pandas as pd
import unicodedata
import os


In [262]:
# Input locations (relative paths) for the STEP lexicon file and the folder of corpus files.
STEP_LEXICON_PATH = 'STEP Data Source/step/step-core-data/src/main/resources/com/tyndalehouse/step/core/data/create/lexicon/lexicon_hebrew.txt'
STEP_CORPORA_PATH = 'STEP Data Source/STEPBible-Data'

# Substring a file name must contain to be treated as a corpus file.
STEP_CORPORA_PREFIX = 'TOTHT'
# STEP_CORPORA_HEADER = ['Ref in Heb', 'Eng ref', 'Pointed', 'Accented', 'Morphology', 'Extended Strongs']

# Keys of the per-word records built by StepBibleHebrewDataProcessor; they become
# the column names of the written CSV.
HEB_REF_ATTR = 'hebrewRef'
ENG_REF_ATTR = 'englishRef'
TEXT_ATTR = 'text'
TEXT_QERE_ATTR = 'textQere'
TRAILER_ATTR = 'trailer'
GLOSS_ATTR = 'gloss'
SENSE_GLOSS_ATTR = 'senseGloss'
MORPH_ATTR = 'morph'
STRONGS_ATTR = 'strongs'
TRAILER_STRONGS_ATTR = 'trailerStrongs'

# Full list of record keys. NOTE(review): TEXT_QERE_ATTR is listed here but no
# visible code fills it in yet.
STEP_CORPORA_HEADER = [HEB_REF_ATTR, ENG_REF_ATTR, TEXT_ATTR, TEXT_QERE_ATTR, TRAILER_ATTR, GLOSS_ATTR, SENSE_GLOSS_ATTR, MORPH_ATTR, STRONGS_ATTR, TRAILER_STRONGS_ATTR]

# Output folder and file names for the merged STEP corpus.
STEP_DATA_DEST = 'STEP Data Destination'
STEP_CORPUS = 'TOTHT.csv'
STEP_CORPUS_WITH_QERE = 'TOTHT-with-qere.csv'

# File names of the other corpora that the STEP text is compared against.
MACULA_CORPUS = 'macula-hebrew.tsv'
ETCBC_CORPUS = 'word.csv'

In [234]:
class StepBibleHebrewDataProcessor:
    """Merge the tab-separated STEP corpus files into one per-word CSV.

    Each data line of a corpus file holds six tab-separated columns; columns
    3, 4 and 5 contain '/'-separated word segments, morphology codes and
    Strong's entries.  Every non-trailer segment becomes one output record,
    and trailer characters are attached to the record of the preceding word.
    """

    # Segments treated as trailers (attached to the previous word, not emitted on their own).
    trailers = ['׃', 'פ', '׀', 'ס', ' פ', '׆', '־']
    # maqef = '־'

    def __init__(self, corpora_files_path, lexicon_file_path):
        """Store the input locations and index the corpus files found on disk.

        Args:
            corpora_files_path: Directory that contains the corpus files.
            lexicon_file_path: Path of the lexicon file (stored only; not read here).
        """
        self.lexicon_file_path = lexicon_file_path
        self.corpora_files_path = corpora_files_path
        self.corpora_files_dict = self.get_corpora_dict()

    def get_corpora_dict(self):
        """Map each book key to the corpus file whose name contains it.

        Returns:
            dict: Key -> file name; the value stays '' when no file matched.
        """
        # Assuming file format 'TOTHT Gen-Deu - Translators OT Hebrew Tagged text - STEPBible.org CC BY.txt'
        corpora_dict = {
            'Gen': '',
            'Jos': '',
            'Job': '',
            'Isa': ''
        }

        # Assign the files to the references in the dictionary.
        for file in os.listdir(self.corpora_files_path):
            if STEP_CORPORA_PREFIX in file:
                for ref in corpora_dict.keys():
                    if ref in file:
                        corpora_dict[ref] = file

        return corpora_dict

    @staticmethod
    def _parse_strongs(entry):
        """Split one Strong's entry into (number, gloss, sense gloss).

        E.g. 'H7225=רֵאשִׁית=first_§1_beginning' -> ('H7225', 'first', '§1.beginning').
        """
        parts = entry.split('=')
        gloss_data = parts[-1].split('_')

        sense_gloss = None
        if len(gloss_data) == 2:
            sense_gloss = gloss_data[1]
        elif len(gloss_data) == 3:
            sense_gloss = gloss_data[1] + '.' + gloss_data[2]

        return parts[0], gloss_data[0], sense_gloss

    def _parse_row(self, row, line_i):
        """Turn one six-column data row into a list of per-word records."""
        # E.g., "Gen.1.1-01	Gen.1.1-01	בְּרֵאשִׁית	בְּ/רֵאשִׁ֖ית	HR/Ncfsa	H9003=ב=in/H7225=רֵאשִׁית=first_§1_beginning"
        words = row[3].split('/')
        morph_codes = row[4].split('/')
        strongs_data = row[5].split('/')

        # With more segments than morph codes, codes are indexed by the number
        # of words emitted so far instead of by segment position.
        morph_by_word_count = len(words) > len(morph_codes)
        last_index = len(words) - 1

        records = []
        data = None
        word_count = 0

        for i, word in enumerate(words):

            if word == '':
                continue

            if i < len(strongs_data):
                strongs_number, gloss, sense_gloss = self._parse_strongs(strongs_data[i])
            else:
                # Previously the values of the preceding word were silently reused.
                print('Missing Strongs entry:', line_i, i, word, row)
                strongs_number = gloss = sense_gloss = None

            # Faulty data:
            # 2Ki.7.15-14.K	2Ki.7.15-14k	בְּהֵחָפְזָם	בְּ/הֵ/חָפְזָ/ם	HR/VNcc/Sp3mp	H9003=ב=in/H9009#1=ה=the/H2648=חָפַז=to hurry/H9048=Sp3m=they
            if strongs_number == 'H9009#1':
                continue

            if word in self.trailers:
                if data is None:
                    # No word to attach to yet; this used to raise KeyError.
                    print('Trailer without preceding word:', line_i, i, word, row)
                    continue
                data[TRAILER_ATTR] += word
                data[TRAILER_STRONGS_ATTR] = strongs_number
                continue

            if data is not None:
                records.append(data)

            morph_index = word_count if morph_by_word_count else i
            if morph_index < len(morph_codes):
                morph = morph_codes[morph_index]
            else:
                print('Missing morph code:', line_i, i, word, row)
                morph = None

            data = {
                HEB_REF_ATTR: row[0],
                ENG_REF_ATTR: row[1],
                TEXT_ATTR: word,
                # The last segment of a row gets a space as its default trailer.
                TRAILER_ATTR: '' if i != last_index else ' ',
                GLOSS_ATTR: gloss,
                SENSE_GLOSS_ATTR: sense_gloss,
                MORPH_ATTR: morph,
                STRONGS_ATTR: strongs_number,
                TRAILER_STRONGS_ATTR: None,
            }
            word_count += 1

        # Only emit a record when the row actually produced a word.
        if data is not None:
            records.append(data)

        return records

    # Write all corpora files into a single corpus csv.
    def write_corpora_data(self, with_qere=False):
        """Parse every indexed corpus file and write the merged CSV.

        Args:
            with_qere: Selects the output file name (STEP_CORPUS_WITH_QERE
                instead of STEP_CORPUS).  Rows whose reference contains '.Q'
                are always skipped.
                TODO: merging qere readings into ketiv rows is not implemented.
        """
        rows = []

        for ref, file in self.corpora_files_dict.items():

            if not file:
                # No file matched this key; opening the bare directory would fail.
                print('No corpus file found for: ' + ref)
                continue

            file_path = os.path.join(self.corpora_files_path, file)
            # Track when we've arrived at the Hebrew content in the file.
            atData = False

            # The content is Hebrew, so do not rely on the platform default encoding.
            with open(file_path, encoding='utf-8') as f:

                for line_i, line in enumerate(f):

                    row = line.rstrip('\n').split('\t')

                    if len(row) > 1 and ref in row[0]:
                        atData = True

                    # Don't include qere data.
                    if atData and len(row) == 6 and '.Q' not in row[0]:
                        rows.extend(self._parse_row(row, line_i))

            print('Complete: ' + file)

        # Write the data.
        df = pd.DataFrame(rows)
        write_file = STEP_CORPUS_WITH_QERE if with_qere else STEP_CORPUS
        os.makedirs(STEP_DATA_DEST, exist_ok=True)
        save_path = os.path.join(STEP_DATA_DEST, write_file)
        df.to_csv(save_path, sep=',', encoding='utf-8')

In [235]:
# Index the corpus files, then parse them and write the merged corpus CSV
# (default arguments, i.e. with_qere=False).
a = StepBibleHebrewDataProcessor(STEP_CORPORA_PATH, STEP_LEXICON_PATH)
a.write_corpora_data()
# print(a.corpora_files_dict)

Complete: TOTHT Gen-Deu - Translators OT Hebrew Tagged text - STEPBible.org CC BY.txt
Complete: TOTHT Jos-Est - Translators OT Hebrew Tagged text - STEPBible.org CC BY.txt
Complete: TOTHT Job-Sng - Translators OT Hebrew Tagged text - STEPBible.org CC BY.txt
Complete: TOTHT Isa-Mal - Translators OT Hebrew Tagged text - STEPBible.org CC BY.txt


In [None]:
# Scratch cell: print the first lines of one corpus file and flag rows whose
# word column contains a segment that is a single space.
header = ['Ref in Heb', 'Eng ref', 'Pointed', 'Accented', 'Morphology', 'Extended Strongs']
rows = []
elim = ['׃', '־', 'פ', '׀', 'ס', ' פ', '׆']
sf = os.path.join(STEP_CORPORA_PATH, a.corpora_files_dict['Isa'])

# The content is Hebrew, so do not rely on the platform default encoding.
with open(sf, encoding='utf-8') as f:

    atData = False

    # A dedicated counter: the inner loop used to overwrite the line counter `i`.
    for line_no, line in enumerate(f, start=1):
        data = line.strip().split('\t')
        print(data)

        if line_no > 60:
            break
        # The file opened above is the 'Isa' one, so look for its references
        # (the previous 'Gen.' marker could never match in this file).
        if 'Isa.' in data[0]:
            atData = True

        # Guard the column access: not every line has a word column.
        if atData and len(data) > 3:

            for word in data[3].split('/'):

                if word == ' ':
                    print(data)


# with open('step1.csv', 'w', encoding='UTF8', newline='') as f:
    
#     writer = csv.writer(f)

#     # write the header
#     writer.writerow(header)

#     # write multiple rows
#     writer.writerows(rows)

In [221]:
# Load both tables fully into memory as lists of rows.
# NOTE(review): step_file / macula_file are assigned in a later cell of this
# notebook; that cell must have been run before this one.
# newline='' is what the csv module expects; the explicit encoding avoids
# depending on the platform default for Hebrew text.  The `with` blocks close
# the handles that were previously left open.
with open(step_file, 'rt', encoding='utf-8', newline='') as step_handle:
    step = list(csv.reader(step_handle, delimiter=','))
with open(macula_file, 'rt', encoding='utf-8', newline='') as macula_handle:
    macula = list(csv.reader(macula_handle, delimiter='\t'))
# etcbc = list(csv.reader(open(etcbc_file, 'rt'), delimiter=','))


In [56]:
def hebStripped(word):
    """Return `word` with every combining character removed.

    The text is first decomposed with NFKD, then only the characters whose
    Unicode combining class is 0 are kept.
    """
    decomposed = unicodedata.normalize('NFKD', word)
    base_chars = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)

    return ''.join(base_chars)

In [241]:
def getAlignedIndeces(tableA, rowA, colA, tableB, rowB, colB):
    """Search a small window for the next pair of rows whose words line up.

    For each offset n in 0..2 (applied to both start rows) two probes run:
      A) keep the row of tableA and look 1-2 rows ahead in tableB for a word
         that contains the (mark-stripped) tableA word;
      B) keep the row of tableB and look 0-2 rows ahead in tableA for a word
         that contains the (mark-stripped) tableB word.
    Empty words never match.

    Args:
        tableA, tableB: Row-indexable tables (row -> column -> str).
        rowA, rowB: Row indices where the search starts.
        colA, colB: Column index of the word in each table.

    Returns:
        tuple[int, int]: (row in tableA, row in tableB) of the first match,
        or 0 when nothing matched inside the window.
    """
    dist = 3
    tries = 3

    for n in range(dist):

        # Offset from the ORIGINAL start rows.  The previous `rowA += n`
        # accumulated, so the offsets were 0, 1, 3 and row +2 was never tried.
        baseA = rowA + n
        baseB = rowB + n

        # Stop at the end of either table instead of raising IndexError.
        if baseA >= len(tableA) or baseB >= len(tableB):
            break

        # Probe A: fixed row in tableA, look ahead in tableB.
        wordA = hebStripped(tableA[baseA][colA])
        for i in range(1, tries):

            if baseB + i >= len(tableB):
                break

            wordB = hebStripped(tableB[baseB + i][colB])
            print('A', [wordA, wordB])
            if wordA in wordB and wordA != '':
                print('A', wordA, baseA, baseB + i)

                return baseA, baseB + i

        # Probe B: fixed row in tableB, look ahead in tableA.
        wordB = hebStripped(tableB[baseB][colB])
        for i in range(tries):

            if baseA + i >= len(tableA):
                break

            wordA = hebStripped(tableA[baseA + i][colA])
            print('B', [wordA, wordB])
            if wordB in wordA and wordB != '':
                print(wordB, baseA + i, baseB)

                return baseA + i, baseB

    return 0

In [386]:
class WordFileParser:
    """Cursor over the word and reference columns of one corpus file.

    Two parsers are walked in parallel; comparison results are appended as
    rows to a caller-supplied `table` list.

    NOTE(review): a later cell of this notebook defines another class with the
    same name, which replaces this one once that cell is run.
    """

    # Number of rows `crawl` looks ahead in the other parser.
    crawl_dist = 1
    # According to index == case.
    word_cases = ["Same", "Dif markings", "Dif spelling", "Dif word, next same", "Dif word, next not same"]

    def __init__(self, file:str, word_col:str, ref_col:str):
        """Read `word_col` and `ref_col` of `file` (.csv or .tsv) as strings."""
        self.df = pd.read_csv(
            file,
            sep=self.__get_sep(file),
            na_filter=False,
            encoding='utf-8',
            usecols=[word_col, ref_col]
            ).astype(str)
        self.words = self.df[word_col].to_list()
        self.refs = self.df[ref_col].to_list()
        self.i = 0

    def __get_sep(self, file:str) -> str:
        """Return the column separator implied by the file extension.

        Raises:
            ValueError: If the extension is neither '.csv' nor '.tsv'.
        """
        if file.endswith('.csv'):
            return ','
        if file.endswith('.tsv'):
            return '\t'
        raise ValueError(f"Unsupported file extension (expected .csv or .tsv): {file}")

    def word(self, i:int=None) -> str:
        """Return the word at index `i`, or at the cursor when `i` is None."""
        # `is not None`: index 0 is a valid explicit index.
        if i is not None:
            return self.words[i]
        return self.words[self.i]

    def ref(self, i:int=None) -> str:
        """Return the reference at index `i`, or at the cursor when `i` is None."""
        if i is not None:
            return self.refs[i]
        return self.refs[self.i]

    def length(self) -> int:
        """Return the number of words."""
        return len(self.words)

    def word_comparison(self, wfp:'WordFileParser', new_index:int=0) -> int:
        """Compare the word at this cursor with a word of `wfp`.

        The other word is taken at max(wfp.i, new_index).

        Returns:
            int: 0 identical; 1 equal after stripping marks, or same first
            letter with same last letter / same length; 2 same first letter
            only; 3 different but the following words match after stripping;
            4 otherwise.
        """
        index = max(wfp.i, new_index)
        w1 = self.word()
        w2 = wfp.word(index)

        if w1 == w2:
            return 0

        _w1 = hebStripped(w1)
        _w2 = hebStripped(w2)

        if _w1 == _w2:
            return 1

        if len(_w1) > 1 and len(_w2) > 1 and _w1[0] == _w2[0]:
            if _w1[-1] == _w2[-1] or len(_w1) == len(_w2):
                return 1
            return 2

        # `length` is a method and must be called; the bound on the other side
        # uses `index`, because that is the position actually read below.
        if self.i + 1 < self.length() and index + 1 < wfp.length() \
        and hebStripped(self.word(self.i+1)) == hebStripped(wfp.word(index+1)):
            return 3

        return 4

    def crawl(self, wfp:'WordFileParser', table:list, again=False):
        """Look ahead in `wfp` for a word matching the word at this cursor.

        Up to `crawl_dist` rows after wfp's cursor are tried.  Only an exact
        match (0) counts, unless `again` is set, in which case 1-3 count too.
        On success the skipped rows and the match are appended to `table`.

        Returns:
            bool: True when a match was found and recorded.
        """
        dist = 0
        _i = wfp.i + 1

        while dist < self.crawl_dist and _i < wfp.length():

            comp = self.word_comparison(wfp, new_index=_i)

            if comp == 0 or (again and comp in [1,2,3]):
                self.update_comparisons(wfp, comp, table, _i)
                return True

            _i += 1
            dist += 1

        return False

    def update_comparisons(self, wfp:'WordFileParser', comp:int, table:list, new_index:int=0):
        """Record a comparison result in `table` and advance both cursors.

        Rows of `wfp` before `new_index` are first emitted as unmatched
        (code 4, 'NA' on this side).  An exact match is written without a
        code column; codes 1-3 are written with the code.

        Returns:
            True when a row was recorded, None when `comp` is not 0-3.
        """
        while wfp.i < new_index:
            table.append([self.ref(), 'NA', wfp.word(), wfp.ref(), 4])
            wfp.i += 1

        if comp == 0:
            table.append([self.ref(), self.word(), wfp.word(), wfp.ref()])
        elif comp in [1,2,3]:
            table.append([self.ref(), self.word(), wfp.word(), wfp.ref(), comp])
        else:
            return None

        self.i += 1
        wfp.i += 1

        return True

    def add_row(self, wfp:'WordFileParser', table:list):
        """Record the current pair with code 5 and advance both cursors."""
        table.append([self.ref(), self.word(), wfp.word(), wfp.ref(), 5])
        self.i += 1
        wfp.i += 1

In [404]:
class WordFileParser:
    """Cursor over the word and reference columns of one corpus file.

    Two parsers are walked in parallel.  Each one collects its own side of the
    aligned output in `words_output`, `refs_output` and `cases_output`.
    """

    # Initial look-ahead depth used by `crawl`; restored by `reset_crawl_depth`.
    og_crawl_depth = 1
    # According to index == case.
    word_cases = ["Same", "Dif markings", "Dif spelling", "Dif word, next same", "Dif word, next not same"]

    def __init__(self, file:str, word_col:str, ref_col:str, name:str):
        """Read `word_col` and `ref_col` of `file` (.csv or .tsv) as strings.

        Args:
            file: Path of the corpus file.
            word_col: Name of the column holding the words.
            ref_col: Name of the column holding the references.
            name: Label stored on the instance as `self.name`.
        """
        self.df = pd.read_csv(
            file,
            sep=self.__get_sep(file),
            na_filter=False,
            encoding='utf-8',
            usecols=[word_col, ref_col]
            ).astype(str)
        self.words = self.df[word_col].to_list()
        self.refs = self.df[ref_col].to_list()
        self.words_output = []
        self.refs_output = []
        self.cases_output = []
        self.i = 0
        self.crawl_depth = self.og_crawl_depth
        self.length = len(self.words)
        self.name = name


    def __get_sep(self, file:str) -> str:
        """Return the column separator implied by the file extension.

        Raises:
            ValueError: If the extension is neither '.csv' nor '.tsv'.
        """
        if file.endswith('.csv'):
            return ','
        if file.endswith('.tsv'):
            return '\t'
        raise ValueError(f"Unsupported file extension (expected .csv or .tsv): {file}")


    def word(self, i:int=None) -> str:
        """Return the word at index `i`, or at the cursor when `i` is None."""
        # `is not None`: index 0 is a valid explicit index.
        if i is not None:
            return self.words[i]

        return self.words[self.i]


    def ref(self, i:int=None) -> str:
        """Return the reference at index `i`, or at the cursor when `i` is None."""
        if i is not None:
            return self.refs[i]

        return self.refs[self.i]


    def reset_crawl_depth(self):
        """Restore `crawl_depth` to the class default."""
        self.crawl_depth = self.og_crawl_depth


    def update_output_lists(self, values:list):
        """Append one `[word, ref, case]` triple to the output lists."""
        self.words_output.append(values[0])
        self.refs_output.append(values[1])
        self.cases_output.append(values[2])


    def word_comparison(self, other_wfp:'WordFileParser', new_index:int=0) -> int:
        """Compare the word at this cursor with a word of `other_wfp`.

        The other word is taken at max(other_wfp.i, new_index).

        Returns:
            int: 0 identical; 1 equal after stripping marks, or same first
            letter with same last letter / same length; 2 same first letter
            only; 3 different but the following words match after stripping;
            4 otherwise (including when either stripped word is empty).
        """
        other_index = max(other_wfp.i, new_index)
        word_a = self.word()
        word_b = other_wfp.word(other_index)

        if word_a == word_b:
            return 0

        word_a_cons = hebStripped(word_a)
        word_b_cons = hebStripped(word_b)

        if word_a_cons == word_b_cons:
            return 1

        if 0 in [len(word_a_cons), len(word_b_cons)]:
            return 4

        if len(word_a_cons) > 1 and len(word_b_cons) > 1 and word_a_cons[0] == word_b_cons[0]:

            if word_a_cons[-1] == word_b_cons[-1] or len(word_a_cons) == len(word_b_cons):
                return 1

            return 2

        # Bound the look-ahead with `other_index`, the position actually read
        # below; checking `other_wfp.i` could overrun the list during a crawl.
        if self.i + 1 < self.length and other_index + 1 < other_wfp.length \
        and hebStripped(self.word(self.i+1)) == hebStripped(other_wfp.word(other_index+1)):
            return 3

        return 4


    def crawl(self, other_wfp:'WordFileParser', again=False):
        """Look ahead in `other_wfp` for a word matching the word at this cursor.

        Up to `crawl_depth` rows after the other cursor are tried.  Only an
        exact match (0) counts, unless `again` is set, in which case 1-3
        count too.  On success the skipped rows and the match are recorded.

        Returns:
            bool: True when a match was found and recorded.
        """
        depth = 0
        runner_index = other_wfp.i + 1

        while depth < self.crawl_depth and runner_index < other_wfp.length:

            comp = self.word_comparison(other_wfp, new_index=runner_index)

            if comp == 0 or (again and comp in [1,2,3]):
                self.update_comparisons(other_wfp, comp, runner_index)
                return True

            runner_index += 1
            depth += 1

        return False


    def update_comparisons(self, other_wfp:'WordFileParser', comp:int, new_index:int=0):
        """Record a comparison result on both parsers and advance both cursors.

        Rows of `other_wfp` before `new_index` are first emitted as unmatched
        (code 4, 'NA' on this side).

        Returns:
            True when the pair was recorded (comp in 0-3), None otherwise.
        """
        while other_wfp.i < new_index:
            other_wfp.update_output_lists([other_wfp.word(), other_wfp.ref(), 4])
            self.update_output_lists(['NA', self.ref(), 4])
            other_wfp.i += 1

        if comp not in range(4):
            return None

        other_wfp.update_output_lists([other_wfp.word(), other_wfp.ref(), comp])
        self.update_output_lists([self.word(), self.ref(), comp])
        self.i += 1
        other_wfp.i += 1

        return True


    def add_row(self, other_wfp:'WordFileParser'):
        """Record the current pair with code 5 and advance both cursors."""
        other_wfp.update_output_lists([other_wfp.word(), other_wfp.ref(), 5])
        self.update_output_lists([self.word(), self.ref(), 5])
        self.i += 1
        other_wfp.i += 1

In [264]:
# File path plus word / reference column names for each corpus handed to WordFileParser.
step_file = os.path.join(STEP_DATA_DEST, STEP_CORPUS)
step_word_col = 'text'
step_ref_col = 'hebrewRef'

# Joining with "" leaves the bare file name, i.e. a path relative to the working directory.
macula_file = os.path.join("", MACULA_CORPUS)
macula_word_col = 'text'
macula_ref_col = 'ref'

etcbc_file = os.path.join("", ETCBC_CORPUS)
etcbc_word_col = 'text'
etcbc_ref_col = 'vsIdBHS'

In [405]:
# One parser per corpus; each constructor reads its file with pandas.
# The last argument is the label used for the output column names in compare_data.
step_wfp = WordFileParser(step_file, step_word_col, step_ref_col, 'step')
macula_wfp = WordFileParser(macula_file, macula_word_col, macula_ref_col, 'macula')
etcbc_wfp = WordFileParser(etcbc_file, etcbc_word_col, etcbc_ref_col, 'etcbc')

In [412]:
def compare_data(wfp1:'WordFileParser', wfp2:'WordFileParser'):
    """Align the words of two parsers and return the result as a DataFrame.

    Both cursors advance together while the current words compare as 0-3.
    Otherwise each parser crawls ahead in the other (exact matches first,
    then looser ones) with a look-ahead depth growing from its initial value
    up to 3; if nothing is found the pair is recorded via `add_row`.

    The parsers are mutated (cursor and output lists), so pass fresh
    instances: reusing a parser from an earlier comparison continues from its
    previous state.

    Args:
        wfp1: Parser whose name prefixes the first two output columns.
        wfp2: Parser whose name prefixes the next two; its case codes fill
            the "code" column.

    Returns:
        pandas.DataFrame: Columns `<name1>Ref`, `<name1>Text`, `<name2>Text`,
        `<name2>Ref`, `code`, all cast to str.
    """
    while wfp1.i < wfp1.length and wfp2.i < wfp2.length:

        comp = wfp1.word_comparison(wfp2)

        if not wfp1.update_comparisons(wfp2, comp):

            while wfp1.crawl_depth <= 3:
                if wfp1.crawl(wfp2):
                    break
                elif wfp2.crawl(wfp1):
                    break
                elif wfp1.crawl(wfp2, again=True):
                    break
                elif wfp2.crawl(wfp1, again=True):
                    break
                wfp1.crawl_depth += 1
                wfp2.crawl_depth += 1

            else:
                # No crawl succeeded at any depth: record the pair as unresolved.
                wfp1.add_row(wfp2)
                # print("ERROR", wfp1.crawl_dist, wfp1.i, wfp1.word(), wfp2.i, wfp2.word())
                # return table

            wfp1.reset_crawl_depth()
            wfp2.reset_crawl_depth()

        # Progress output.  Either cursor may just have moved past the end, in
        # which case word() would raise IndexError, so check the bounds first.
        if wfp1.i % 50000 < 5 and wfp1.i < wfp1.length and wfp2.i < wfp2.length:
            print(wfp1.i, wfp1.word(), wfp2.i, wfp2.word())

    table = {
        f"{wfp1.name}Ref": wfp1.refs_output,
        f"{wfp1.name}Text": wfp1.words_output,
        f"{wfp2.name}Text": wfp2.words_output,
        f"{wfp2.name}Ref": wfp2.refs_output,
        "code": wfp2.cases_output,
    }

    return pd.DataFrame(table).astype(str)

In [413]:
# Run one comparison and save it. The commented-out lines are alternative pairings.
# NOTE(review): compare_data mutates the parsers it is given, so switching to
# another pairing presumably needs freshly constructed parsers — confirm.
# df = compare_data(macula_wfp, step_wfp)
df = compare_data(step_wfp, macula_wfp)

# df = compare_data(etcbc_wfp, macula_wfp)

# df = compare_data(step_wfp, etcbc_wfp)
write_file = 'comp10.csv'
df.to_csv(write_file, encoding='utf-8', index=False)

In [247]:
def compareStepToMacula(mTable, mwCol, sTable, swCol):
    """Walk two row tables in parallel and list their words side by side.

    Words are compared after hebStripped(). On a mismatch a few heuristics
    (single-letter words, shared two-letter prefix, window search via
    getAlignedIndeces) decide how far each cursor advances.

    NOTE(review): a later cell defines a function with the same name, which
    replaces this one once that cell is run.

    Args:
        mTable: Row table; column 1 is used as the reference, column `mwCol` as the word.
        mwCol: Word column index in `mTable`.
        sTable: Row table; column 0 is used as the reference, column `swCol` as the word.
        swCol: Word column index in `sTable`.

    Returns:
        list: Rows of `[ref, mTable word, sTable word]`, with a fourth flag
        element on rows produced by the mismatch handling. Returns early with
        the rows gathered so far when getAlignedIndeces finds nothing.
    """

    rows = []

    # NOTE(review): never used in this function.
    mismatches = {}

    # sTable words equal to one of these are skipped.
    elim = ['׃', '־', 'פ', '׀', 'ס', ' פ', '׆']

    # Both cursors start at row 1 — presumably to skip a header row; confirm.
    mRow = 1
    sRow = 1

    # NOTE(review): only sRow is bounded here; mRow and the `+1` look-aheads
    # below are not checked against the table lengths.
    while sRow < len(sTable):

        ref = mTable[mRow][1]
        mw = mTable[mRow][mwCol]
        sw = sTable[sRow][swCol]

        # Skip sTable rows whose reference contains '.K' and rows holding only an `elim` mark.
        if '.K' in sTable[sRow][0] or sw in elim:
            sRow += 1
            continue

        mw = hebStripped(mw)
        sw = hebStripped(sw)

        # Hard-coded special case for one reference.
        if ref == 'RUT 3:17!9' and mw == 'אל':
            rows.append([ref, mw, 'NA', 'True'])
            rows.append([ref, 'י', 'NA', 'True'])
            mRow += 1
            sRow += 1
            continue

        if mw != sw:

            print(ref, mRow, sRow, [mw, sw])

            # Likely a difference in suffix marking, like Gen 9:21!7
            if len(mw) == 1 and len(sw) == 1:

                rows.append([ref, mw, sw, '?'])
                mRow += 1
                sRow += 1

            # Likely absence of a suffix, like Gen 15:2!3
            elif len(mw) > 2 and len(sw) > 2 and mw[:2] == sw[:2] :

                rows.append([ref, mw, sw, True])

                # Likely dif in ketiv, e.g., Gen 14:2!17 ['צבויים' ,'צבוים']
                if mw[-1] == sw[-1] or '.Q' in sTable[sRow][0]:
                    mRow += 1
                    sRow += 1

                # The mTable word is longer: also consume the next sTable row.
                elif len(mw) > len(sw):

                    rows.append([
                        ref,
                        'NA',
                        sTable[sRow+1][swCol],
                        True
                    ])

                    mRow += 1
                    sRow += 2

                # The sTable word is longer (or equal): also consume the next mTable row.
                else:

                    rows.append([
                        ref,
                        mTable[mRow+1][mwCol],
                        'NA',
                        True
                    ])

                    mRow += 2
                    sRow += 1

            else:

                # No heuristic applied: search a small window for the next aligned pair.
                newRowIndeces = getAlignedIndeces(mTable, mRow, mwCol, sTable, sRow, swCol)

                print(mRow, sRow, newRowIndeces)
                # getAlignedIndeces returns 0 on failure; give up with what we have.
                if not newRowIndeces:
                    return rows

                newMRow, newSRow = newRowIndeces

                # Emit the skipped rows pairwise first, then pad the longer side with 'NA'.
                if newMRow - mRow > newSRow - sRow:

                    while sRow < newSRow:

                        ref = mTable[mRow][1]
                        mw = mTable[mRow][mwCol]
                        sw = sTable[sRow][swCol]

                        rows.append([ref, mw, sw, False])

                        mRow += 1
                        sRow += 1

                    while mRow < newMRow:

                        ref = mTable[mRow][1]
                        mw = mTable[mRow][mwCol]
                        sw = sTable[sRow][swCol]

                        rows.append([ref, mw, 'NA', False])

                        mRow += 1

                else:

                    while mRow < newMRow:

                        ref = mTable[mRow][1]
                        mw = mTable[mRow][mwCol]
                        sw = sTable[sRow][swCol]

                        rows.append([ref, mw, sw, False])

                        mRow += 1
                        sRow += 1

                    while sRow < newSRow:

                        ref = mTable[mRow][1]
                        mw = mTable[mRow][mwCol]
                        sw = sTable[sRow][swCol]

                        rows.append([ref, 'NA', sw, True])

                        sRow += 1

        # NOTE(review): the mismatch branches above fall through to here, so the
        # pair at the already-advanced cursors is recorded without being compared.
        rows.append([
            mTable[mRow][1],
            mTable[mRow][mwCol],
            sTable[sRow][swCol],
        ])

        mRow += 1
        sRow += 1

    return rows

In [251]:
# TODO

'''
EXO 21:8!7,ל֥,לֹא,False
EXO 21:8!7,NA,יְעָדָ֖,True
EXO 21:8!7,NA,הּ,True
EXO 21:8!7,וֹ,וְ
EXO 21:8!8,יְעָדָ֖,הֶפְדָּ֑,False
EXO 21:8!8,הּ,הּ
EXO 21:8!9,ו,ל,?
EXO 21:8!9,הֶפְדָּ֑,עַ֥ם
EXO 21:8!9,הּ,נָכְרִ֛י,False
EXO 21:8!10,NA,לֹא,True
EXO 21:8!10,לְ,יִמְשֹׁ֥ל
EXO 21:8!10,עַ֥ם,NA,False
EXO 21:8!11,נָכְרִ֛י,NA,False
EXO 21:8!12,לֹא,לְ
EXO 21:8!13,יִמְשֹׁ֥ל,NA,False
EXO 21:8!14,לְ,NA,False
'''

# TODO : check if the next val is equal.
# a[i+1] == b[i+1]
'''
EXO 28:28!5,טבעתי,טבעת,True
EXO 28:28!5,NA,וֹ,True

NUM 23:13!5,לכ,לך,True
NUM 23:13!5,ה,נָּ֨א
NUM 23:13!6,נָּ֨א,NA,False
'''

def compareStepToMacula(mTable, mwCol, sTable, swCol):
    """Walk two row tables in parallel and list their words side by side.

    Words are compared after hebStripped(). On a mismatch the function first
    checks whether the following words match; otherwise it applies a
    shared-first-letter heuristic or falls back to getAlignedIndeces.

    Args:
        mTable: Row table; column 1 is used as the reference, column `mwCol` as the word.
        mwCol: Word column index in `mTable`.
        sTable: Row table; column `swCol` is used as the word.
        swCol: Word column index in `sTable`.

    Returns:
        list: Rows of `[ref, mTable word, sTable word]`, with a fourth flag
        element on rows produced by the mismatch handling. When
        getAlignedIndeces finds nothing, a row flagged "FAILED" is appended
        and the rows gathered so far are returned.
    """

    rows = []

    # NOTE(review): never used in this function.
    mismatches = {}

    # Both cursors start at row 1 — presumably to skip a header row; confirm.
    mRow = 1
    sRow = 1

    length = min(len(mTable), len(sTable))

    # NOTE(review): only sRow is compared with `length`; mRow and the `+1`
    # look-aheads below are not bounds-checked.
    while sRow < length:

        ref = mTable[mRow][1]
        mw = mTable[mRow][mwCol]
        sw = sTable[sRow][swCol]

        mw = hebStripped(mw)
        sw = hebStripped(sw)

        # Hard-coded special case for one reference: two mTable rows, no sTable row.
        if ref == 'RUT 3:17!9' and mw == 'אל':
            rows.append([ref, mw, 'NA', 'True'])
            rows.append([ref, 'י', 'NA', 'True'])
            mRow += 2
            sRow += 0
            continue

        if mw != sw:

            # If the following words agree, treat this as a one-word difference.
            nextIsEqual = hebStripped(mTable[mRow+1][mwCol]) == hebStripped(sTable[sRow+1][swCol])

            if nextIsEqual:
                rows.append([ref, mw, sw, True])
                mRow += 1
                sRow += 1
                continue


            print(ref, mRow, sRow, [mw, sw])

            # Likely a difference in suffix marking, like Gen 9:21!7
            # if len(mw) == 1 and len(sw) == 1:

            #     rows.append([ref, mw, sw, '?'])
            #     mRow += 1
            #     sRow += 1

            # Likely absence of a suffix, like Gen 15:2!3
            # if len(mw) > 2 and len(sw) > 2 and mw[:2] == sw[:2]:
            if len(mw) > 0 and len(sw) > 0 and mw[0] == sw[0]:

                rows.append([ref, mw, sw, True])

                # Likely dif in ketiv, e.g., Gen 14:2!17 ['צבויים' ,'צבוים']
                if mw[-1] == sw[-1] or len(mw) == len(sw):
                    mRow += 1
                    sRow += 1

                # The mTable word is longer: also consume the next sTable row.
                elif len(mw) > len(sw):

                    rows.append([
                        ref,
                        'NA',
                        sTable[sRow+1][swCol],
                        True
                    ])

                    mRow += 1
                    sRow += 2

                # The sTable word is longer: also consume the next mTable row.
                else:

                    rows.append([
                        ref,
                        mTable[mRow+1][mwCol],
                        'NA',
                        True
                    ])

                    mRow += 2
                    sRow += 1

            else:

                # No heuristic applied: search a small window for the next aligned pair.
                newRowIndeces = getAlignedIndeces(mTable, mRow, mwCol, sTable, sRow, swCol)

                print(mRow, sRow, newRowIndeces)
                # getAlignedIndeces returns 0 on failure; record it and stop.
                if not newRowIndeces:
                    rows.append([ref, mw, sw, "FAILED"])
                    return rows

                newMRow, newSRow = newRowIndeces

                # Emit the skipped rows pairwise first, then pad the longer side with 'NA'.
                if newMRow - mRow > newSRow - sRow:

                    while sRow < newSRow:

                        ref = mTable[mRow][1]
                        mw = mTable[mRow][mwCol]
                        sw = sTable[sRow][swCol]

                        rows.append([ref, mw, sw, False])

                        mRow += 1
                        sRow += 1

                    while mRow < newMRow:

                        ref = mTable[mRow][1]
                        mw = mTable[mRow][mwCol]
                        sw = sTable[sRow][swCol]

                        rows.append([ref, mw, 'NA', False])

                        mRow += 1

                else:

                    while mRow < newMRow:

                        ref = mTable[mRow][1]
                        mw = mTable[mRow][mwCol]
                        sw = sTable[sRow][swCol]

                        rows.append([ref, mw, sw, False])

                        mRow += 1
                        sRow += 1

                    while sRow < newSRow:

                        ref = mTable[mRow][1]
                        mw = mTable[mRow][mwCol]
                        sw = sTable[sRow][swCol]

                        rows.append([ref, 'NA', sw, True])

                        sRow += 1

        # NOTE(review): the mismatch branches above fall through to here, so the
        # pair at the already-advanced cursors is recorded without being compared.
        rows.append([
            mTable[mRow][1],
            mTable[mRow][mwCol],
            sTable[sRow][swCol],
        ])

        mRow += 1
        sRow += 1

    return rows


In [252]:
# Compare the in-memory tables (word column 5 of `macula`, word column 3 of `step`)
# and save the aligned rows.
rows = compareStepToMacula(macula, 5, step, 3)

# NOTE(review): some rows carry a fourth flag element that has no header name.
header = ['ref', 'macula', 'step']

with open('comp4.csv', 'w', encoding='UTF8', newline='') as f:

    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write multiple rows
    writer.writerows(rows)

GEN 1:5!3 62 62 ['', 'אור']
A ['', 'יום']
A ['', 'ו']
B ['', 'אור']
B ['אור', 'אור']
אור 63 62
62 62 (63, 62)
GEN 1:5!5 67 66 ['', 'חשך']
A ['', 'קרא']
A ['', 'לילה']
B ['', 'חשך']
B ['חשך', 'חשך']
חשך 68 66
67 66 (68, 66)
GEN 1:7!10 110 108 ['', 'רקיע']
A ['', 'ו']
A ['', 'בין']
B ['', 'רקיע']
B ['רקיע', 'רקיע']
רקיע 111 108
110 108 (111, 108)
GEN 1:7!15 120 117 ['', 'רקיע']
A ['', 'ו']
A ['', 'יהי']
B ['', 'רקיע']
B ['רקיע', 'רקיע']
רקיע 121 117
120 117 (121, 117)
GEN 1:8!3 129 125 ['', 'רקיע']
A ['', 'שמים']
A ['', 'ו']
B ['', 'רקיע']
B ['רקיע', 'רקיע']
רקיע 130 125
129 125 (130, 125)
GEN 1:10!3 164 159 ['', 'יבשה']
A ['', 'ארץ']
A ['', 'ו']
B ['', 'יבשה']
B ['יבשה', 'יבשה']
יבשה 165 159
164 159 (165, 159)
GEN 1:18!2 339 333 ['', 'יום']
A ['', 'ו']
A ['', 'ב']
B ['', 'יום']
B ['יום', 'יום']
יום 340 333
339 333 (340, 333)
GEN 1:18!3 343 336 ['', 'לילה']
A ['', 'ו']
A ['', 'ל']
B ['', 'לילה']
B ['לילה', 'לילה']
לילה 344 336
343 336 (344, 336)
GEN 1:22!10 440 432 ['', 'ימים']
A ['', 'ו

In [163]:
# Scratch cell: pair each '/'-separated segment of column 3 of `step` with a
# `macula` row, using a running offset `n` between the two row indices.
# The cell output shows this run ended in a KeyboardInterrupt.
header = ['ref', 'macula', 'step']
rows = []
# Offset added to the `step` row index to reach the matching `macula` row.
n = 1
# NOTE(review): `x` is never used below.
x = 0

elim = ['׃', '־', 'פ', '׀', 'ס']
for i, row in enumerate(step[1:]):

    # Rows whose reference contains '.K' are skipped; the offset shrinks to compensate.
    if '.K' in row[0]:
        n -= 1
        continue

    ws = row[3].split('/')

    # Drop trailing marks.
    # NOTE(review): raises IndexError if every segment is in `elim` and `ws` becomes empty.
    while ws[-1] in elim:
        ws.pop()

    for w in ws:

        j = i + n
        ref = macula[j][1]
        mw = macula[j][5]
        # Hard-coded exceptions for three specific words.
        if w == 'אַיֶּֽ':
            rows.append([ref, 'NA', w])
        elif w == 'אֵ֖י':
            rows.append([ref, 'NA', w])
            n += -1
        elif w == 'לָעֹ֔מֶר':
            rows.append([ref, '...', w])
            n += -1
        else:
            rows.append([ref, mw, w])
            # An empty word in the next macula row is emitted too and skipped over.
            if macula[j+1][5] == '':
                rows.append([ref, macula[j+1][5], w])
                n += 1
            # NOTE(review): compares by value, so a segment equal to the last
            # one is treated as the last segment even when it is not.
            if w != ws[-1]:
                n += 1

with open('comp.csv', 'w', encoding='UTF8', newline='') as f:

    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write multiple rows
    writer.writerows(rows)

KeyboardInterrupt: 