In [2]:
from tf.app import use
from itertools import chain

A = use('bhsa', hoist=globals())

%load_ext autoreload
%autoreload 2

This is Text-Fabric 9.1.1
Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html

122 features found and 0 ignored


In [3]:
"""
A class that contains data to help assign difficulty weights to 
any portion of Hebrew text that has more than one word. 
"""
class Classify:
    
    def __init__(self):
        # Most prepositions, articles, and conjunctions don't
        # add any meaningul weight to a text. 
        # Example use:
        # words = [w for w in passage if F.sp.v(w) not in stop_words]
        self.stop_words = ['prep', 'art', 'conj']
        # Create frequency ranking buckets for a variety of lexical ranges.
        self.rank_scale = {
            'abundant': list(range(1000, 50000)),
            'frequent': list(range(500, 1000)),
            'common':   list(range(300, 500)),
            'average':  list(range(175, 300)),
            'uncommon': list(range(75, 175)),
            'rare':     list(range(35, 75)),
            'scarce':   list(range(1, 35))
        }
        # Assign exponentially increasing weight penalties according to a word's frequency.
        self.weight_scale = {
            'abundant': 1,
            'frequent': 1.5,
            'common':   2.25,
            'average':  3.5,
            'uncommon': 5,
            'rare':     7.5,
            'scarce':   11
        }

        # Example use of rank_scale and weight_scale:
        # for rank in c.rank_scale.keys():
        #     if F.freq_lex.v(word) in c.rank_scale[rank]:
        #         total_weight += c.weight_scale[rank]

In [10]:
"""
A class that holds a Hebrew paragraph, as marked by a petach (פ) or
samech (ס) in the Masoretic Text. 
"""
class Paragraph:

    def __init__(self, id = 0):
        self.id = id
        self.verses = []
        self.weight = 0
        self._type = ''
        self.types = {'פ': 'open', 'ס': 'closed'}
        self.verb_types_present = set()
        self.verb_stems_present = set()
        self.word_ranks_data = {k:set() for k in Classify().rank_scale.keys()}
        self.start_ref = ''
        self.end_ref = ''

    # Reset the values of all the paragraph's mutable attributes.
    def reset_values(self):
        self.verses = []
        self.score = 0
        self.type = ''
        self.verb_types_present = set()
        self.verb_stems_present = set()
        self.word_ranks_data = {k:set() for k in Classify().rank_scale.keys()}
        self.start_ref = ''
        self.end_ref = ''

    # Returns a list of all words present in the paragraph.
    def get_all_words(self):
        words = []
        for verse in self.verses:
            for word in L.i(verse, otype='word'):
                words.append(word)
        return words

    # Returns a list of all words present in a specified verse in the paragraph.
    def get_vs_words(self, verse):
        words = [w for w in L.i(verse, otype='word')]
        return words

    # Returns a String of all the text in the paragraph.
    def get_text(self):
        return T.text(self.verses, fmt='text-orig-full')

    # Returns a dictionary mapping each verse in the paragraph to a weight.
    def get_vs_weights(self):
        # A class with weight scale data.
        c = Classify()
        # A dictionary mapping verses to weights.
        verse_weights = {}
        # Iterate over verses in the paragraph.
        for verse in self.verses:
            total_weight = 0
            # Create a list with the frequency of each word in the present verse. 
            # Exclude stop words since they add no meaningful weight.
            words = [w for w in self.get_vs_words(verse) 
                    if F.sp.v(w) not in c.stop_words]
            lex_frequencies = [F.freq_lex.v(word) for word in words]
            # Add the scaled word weights to the verse's total weight.
            for lex_freq in lex_frequencies:
                for rank in c.rank_scale.keys():
                    if lex_freq in c.rank_scale[rank]:
                        total_weight += c.weight_scale[rank]
            # Add the verse's weight to the dictionary at this verse's key. 
            total_weight /= len(words)
            verse_weights[verse] = round(total_weight, 4)
        
        return verse_weights

    # Returns the weight of the entire paragraph as a float with 4 decimal places.
    def get_weight(self):
        verse_weights = self.get_vs_weights()
        # Set the denominator to 1 if there are no verses present to
        # avoid dividing by 0. In this case the paragraph weight == 0. 
        denom = 1 if len(self.verses) < 1 else len(self.verses)
        weight = sum(verse_weights.values()) / denom

        return round(weight, 4)

In [27]:
# Set the minimal word requiremnt for a paragraph.
# If a paragraph has less words than specified, it
# will be comined with the following paragraph(s) 
# until len(paragraph.get_all_words()) >= paragraph_size.
paragraph_size = 150

"""
Iterates over all the verses in the OT and combines them
into paragraphs. The function returns a dictionary mapping
paragraph keys (a paragraph's position in the OT) to a 
paragraph object. 
"""
def get_paragaphs():
    # Instantiate our weight classifier. 
    c = Classify()
    # Maps paragraph numbers to paragraph objects. 
    paragraphs = {}

    # Initiate the counter and store the first paragraph.
    paragraph_key = 1
    paragraphs[paragraph_key] = Paragraph(id=paragraph_key)
    paragraph = paragraphs[paragraph_key]
    paragraph.reset_values()

    # Iterate through all verses in the OT. 
    for verse in F.otype.s('verse')[:1000]:
        
        # Get the string value at the end of the verse. 
        ending = T.text(verse).split()[-1]
        
        # Check if the string is a paragraph marker and if the paragraph is large enough.  
        if ending in paragraph.types.keys() \
            and len(paragraph.get_all_words()) >= paragraph_size:

            # We have reached the end of the paragraph so we update all of its attribute values.
            paragraph._type = paragraph.types[ending]
            paragraph.verses.append(verse)
            paragraph.weight = paragraph.get_weight()

            # Update the paragraph's word frequency and verb data.
            for word in paragraph.get_all_words():
                if word not in c.stop_words:
                    # Update the types and stems of verbs present. 
                    if F.sp.v(word) == 'verb':
                        paragraph.verb_types_present.add(F.vt.v(word))
                        paragraph.verb_stems_present.add(F.vs.v(word))
                    # Update the word_ranks_data dictionary with
                    # the words in each category.
                    for rank in list(c.rank_scale.keys()):
                        if F.freq_lex.v(word) in c.rank_scale[rank]:
                            paragraph.word_ranks_data[rank].add(F.lex_utf8.v(word))
                            
            # Update the paragraph's start and end reference.
            start_ref = T.sectionFromNode(paragraph.verses[0])
            end_ref = T.sectionFromNode(paragraph.verses[-1])
            paragraph.start_ref = f"{start_ref[0]} {start_ref[1]}:{start_ref[2]}"
            paragraph.end_ref = f"{end_ref[0]} {end_ref[1]}:{end_ref[2]}"

            # Begin a new paragraph. 
            paragraph_key += 1
            paragraphs[paragraph_key] = Paragraph(id=paragraph_key)
            paragraph = paragraphs[paragraph_key]
            paragraph.reset_values()

        # If we haven't reached the end of the paragraph, simply add the present verse.
        else:
            paragraph.verses.append(verse)

    return paragraphs

In [23]:
# Get a dictionary of all paragraphs in the OT,
# in the order of their occurences.
all_paragraphs = get_paragaphs()

In [28]:
some_paragraphs = get_paragaphs()
paragraphs_sorted = sorted(list(some_paragraphs.values()), key=lambda p: p.weight)

In [24]:
# Get a sorted list of all paragraph objects, ordered 
# by their difficulty weight from easiest to hardest. 
all_paragraphs_sorted = sorted(list(all_paragraphs.values()), key=lambda p: p.weight)

In [25]:
# A helper function to format printing output. 
# Takes a set of words as input and returns a string.
def format_output(output):
    # Sort the words by alphabetical order.
    output = sorted(list(output))
    formatted = ''
    # Add spacing between the words until the
    # last word is reached. 
    for item in output:
        if item != output[-1]:
            formatted += item + '  '
        else:
            formatted += item
    # Return a string of the formatted words. 
    return formatted

In [29]:
c = Classify()
rarest_ranks = list(c.rank_scale.keys())[-3:]
print(len(all_paragraphs_sorted))
for p in paragraphs_sorted[:15]:
    print(f"{p.start_ref} - {p.end_ref}")
    print(f"id: {p.id}")
    print(f"Weight: {p.weight}")
    print(f"Verb types:")
    print(f"[{format_output(p.verb_types_present)}]")
    print(f"Verb stems:")
    print(f"[{format_output(p.verb_stems_present)}]")
    for rank in rarest_ranks:
        print(f"{rank} ({c.rank_scale[rank][0]}-{c.rank_scale[rank][-1]} occ): {len(p.word_ranks_data[rank])} words")
        print(f"[{format_output(p.word_ranks_data[rank])}]")
    print()
    print("Text:")
    print(p.get_text())
    print()

1300
 - 
id: 39
Weight: 0
Verb types:
[]
Verb stems:
[]
uncommon (75-174 occ): 0 words
[]
rare (35-74 occ): 0 words
[]
scarce (1-34 occ): 0 words
[]

Text:
וַיָּבֹא֩ יַעֲקֹ֨ב שָׁלֵ֜ם עִ֣יר שְׁכֶ֗ם אֲשֶׁר֙ בְּאֶ֣רֶץ כְּנַ֔עַן בְּבֹאֹ֖ו מִפַּדַּ֣ן אֲרָ֑ם וַיִּ֖חַן אֶת־פְּנֵ֥י הָעִֽיר׃ וַיִּ֜קֶן אֶת־חֶלְקַ֣ת הַשָּׂדֶ֗ה אֲשֶׁ֤ר נָֽטָה־שָׁם֙ אָהֳלֹ֔ו מִיַּ֥ד בְּנֵֽי־חֲמֹ֖ור אֲבִ֣י שְׁכֶ֑ם בְּמֵאָ֖ה קְשִׂיטָֽה׃ וַיַּצֶּב־שָׁ֖ם מִזְבֵּ֑חַ וַיִּ֨קְרָא־לֹ֔ו אֵ֖ל אֱלֹהֵ֥י יִשְׂרָאֵֽל׃ ס וַתֵּצֵ֤א דִינָה֙ בַּת־לֵאָ֔ה אֲשֶׁ֥ר יָלְדָ֖ה לְיַעֲקֹ֑ב לִרְאֹ֖ות בִּבְנֹ֥ות הָאָֽרֶץ׃ וַיַּ֨רְא אֹתָ֜הּ שְׁכֶ֧ם בֶּן־חֲמֹ֛ור הַֽחִוִּ֖י נְשִׂ֣יא הָאָ֑רֶץ וַיִּקַּ֥ח אֹתָ֛הּ וַיִּשְׁכַּ֥ב אֹתָ֖הּ וַיְעַנֶּֽהָ׃ וַתִּדְבַּ֣ק נַפְשֹׁ֔ו בְּדִינָ֖ה בַּֽת־יַעֲקֹ֑ב וַיֶּֽאֱהַב֙ אֶת־הַֽנַּעֲרָ֔ וַיְדַבֵּ֖ר עַל־לֵ֥ב הַֽנַּעֲרָֽ׃ וַיֹּ֣אמֶר שְׁכֶ֔ם אֶל־חֲמֹ֥ור אָבִ֖יו לֵאמֹ֑ר קַֽח־לִ֛י אֶת־הַיַּלְדָּ֥ה הַזֹּ֖את לְאִשָּֽׁה׃ וְיַעֲקֹ֣ב שָׁמַ֗ע כִּ֤י טִמֵּא֙ אֶת־דִּינָ֣ה בִתֹּ֔ו וּבָנָ֛יו הָי֥וּ אֶת־מִקְנֵ֖הוּ בַּשָּׂדֶ֑ה ו