In [1]:
from tf.app import use
import math
# Load the BHSA corpus (version 'c', from the local checkout). hoist=globals()
# is what exposes the Text-Fabric API handles used throughout this notebook
# (F, L, T) as global names.
A = use('bhsa', hoist=globals(), checkout='local', version='c')

This is Text-Fabric 9.1.1
Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html

120 features found and 0 ignored


# Classes

## Classification Classes

In [2]:
class Rank:
    """
    One lexical-frequency ranking scale, used to assign difficulty weights
    to passages based on the lexical frequencies of their words.

    name:    a short label for the scale (e.g. "10_ranks")
    ranks:   a list of string categories, one per frequency band
    ranges:  a 2D list holding the numeric [low, high] bounds of each band
    weights: a list of the per-word weight penalty assigned to each band

    The three lists are parallel: position i of each describes the same band.
    """

    def __init__(self, name, ranks, ranges, weights):
        self.name, self.ranks = name, ranks
        self.ranges, self.weights = ranges, weights

    def get_rank_dict(self):
        """
        Build the rank_scale dictionary for this scale:
        {rank name: {'range': [low, high], 'weight': penalty}}.
        The stored range lists are referenced, not copied.
        """
        return {
            label: {'range': self.ranges[position], 'weight': self.weights[position]}
            for position, label in enumerate(self.ranks)
        }

In [3]:
""" 
A class to store different ranking scales. 
"""
class LexRanks:
    """
    The catalogue of available ranking scales, each one a Rank instance.

    Every range is a two-element [low, high] list. The code that consumes
    these scales tests `low <= freq < high`, i.e. the lower bound is inclusive
    and the upper bound exclusive, so adjacent bands share a boundary value
    without overlapping.

    Author's note: comparing against two bounds instead of testing membership
    in range() took the run time from ~0:04:30 to ~0:00:15.
    """

    _3_ranks = Rank(
        name="3_ranks",
        ranks=['Frequent', 'Uncommon', 'Rare'],
        ranges=[[100, 51000], [10, 100], [1, 10]],
        weights=[1, 3, 7],
    )

    _4_ranks = Rank(
        name="4_ranks",
        ranks=['Frequent', 'Medium', 'Uncommon', 'Rare'],
        ranges=[[100, 51000], [50, 100], [10, 50], [1, 10]],
        weights=[1, 4, 5, 8],
    )

    _5_ranks_a = Rank(
        name="5a_ranks",
        ranks=['Frequent', 'Common', 'Medium', 'Uncommon', 'Rare'],
        ranges=[[500, 51000], [250, 500], [150, 250], [50, 150], [1, 50]],
        weights=[1, 2, 3, 5, 8],
    )

    _5_ranks_b = Rank(
        name="5b_ranks",
        ranks=['Frequent', 'Common', 'Infrequent', 'Rare', 'Scarce'],
        ranges=[[200, 51000], [100, 200], [50, 100], [20, 50], [1, 20]],
        weights=[1, 1.5, 3, 5, 8],
    )

    _7_ranks = Rank(
        name="7_ranks",
        ranks=['Abundant', 'Frequent', 'Common', 'Average', 'Uncommon', 'Rare', 'Scarce'],
        ranges=[
            [800, 51000], [400, 800], [200, 400], [100, 200],
            [50, 100], [15, 50], [1, 15],
        ],
        weights=[1, 1.1, 1.3, 1.7, 3, 5.5, 8.5],
    )

    _9_ranks = Rank(
        name="9_ranks",
        ranks=['Abundant', 'Frequent', 'Common', 'Average', 'Uncommon', 'Rare', 'Scarce', 'Scarcer', 'Scarcest'],
        ranges=[
            [1000, 51000], [400, 1000], [200, 400], [100, 200],
            [50, 100], [30, 50], [20, 30], [10, 20], [1, 10],
        ],
        weights=[1, 1.1, 1.3, 1.7, 3, 5.5, 8, 9, 10],
    )

    _10_ranks = Rank(
        name="10_ranks",
        ranks=['Abundant', 'Frequent', 'Common', 'Average', 'Uncommon', 'Rare', 'Rarer', 'Scarce', 'Scarcer', 'Scarcest'],
        ranges=[
            [1000, 51000], [400, 1000], [200, 400], [100, 200], [50, 100],
            [40, 50], [30, 40], [20, 30], [10, 20], [1, 10],
        ],
        weights=[1, 1.1, 1.3, 1.7, 3, 5.5, 7, 8, 9, 10],
    )

    # Every scale, ordered from the coarsest to the finest.
    all_ranks = [
        _3_ranks,
        _4_ranks,
        _5_ranks_a,
        _5_ranks_b,
        _7_ranks,
        _9_ranks,
        _10_ranks,
    ]

In [4]:
class MorphRank:
    """
    Morphology penalties for verbs, keyed by the stem and tense codes that
    the notebook reads from F.vs and F.vt.

    `base` is the penalty given to the forms treated as easiest and `other`
    the penalty given to the po-/hitpo- stems listed below.
    """
    other = 8
    base = 0

    # Verbal stem -> penalty.
    stem_map = dict(
        hif=2,       # hif'il
        hit=3,       # hitpa''el
        htpo=other,  # hitpo''el
        hof=5,       # hof'al
        nif=3,       # nif'al
        piel=2,      # pi''el
        poal=other,  # po''al
        poel=other,  # po''el
        pual=5,      # pu''al
        qal=base,    # qal
    )

    # Verbal tense -> penalty.
    tense_map = dict(
        perf=base,  # perfect
        impf=2,     # imperfect
        wayq=base,  # wayyiqtol
        impv=3.5,   # imperative
        infa=5,     # infinitive (absolute)
        infc=2,     # infinitive (construct)
        ptca=3,     # participle
        ptcp=5,     # participle (passive)
    )

In [5]:
"""
A class that contains data to help assign difficulty weights to 
any portion of Hebrew text that has more than one word. 
"""
class Classify:
    """ 
    Exclusion lists used when weighing a text.

    Notes on stop_words_types and the other exclusion lists:

    Most prepositions, articles, and conjunctions don't
    add any meaningful weight to a text and could thus be excluded.
    
    Example use:
    words = [w for w in passage if F.sp.v(w) not in stop_words_types]
    
    Note: the only Heb article is 'הַ' with 30,386 occurrences. There are some 
    preps and conjs that have few occurrences, so I recommend not using
    stop_words_types when weighing passages and using stop_words instead.
    """
    # Part-of-speech codes (values of F.sp) that could be excluded wholesale.
    stop_words_types = ['prep', 'art', 'conj']
    # Check if F.voc_lex_utf8.v(word) is in this list. If
    # so it can be excluded since it occurs so often. 
    stop_words = ['אֵת', 'בְּ', 'לְ', 'הַ', 'וְ']
    # If you take verb data into account when weighing a
    # paragraph, these common types could be excluded. 
    # easy_vtypes holds verbal tense codes, easy_vstems verbal stem codes.
    easy_vtypes = ['perf', 'impf', 'wayq']
    easy_vstems = ['qal', 'hif', 'nif', 'piel']

## Hebrew Passage Classes

In [6]:
class Passage:
    """
    A Hebrew passage: a run of consecutive verses, normally a paragraph as
    marked by a petach (פ) or samech (ס) in the Masoretic Text. For a book
    like Psalms, which lacks paragraph markers, the passages are split at the
    chapter level. The splitting itself is done by the code that fills
    `verses`; this class only stores the passage and computes its weights.

    The methods rely on the Text-Fabric handles F, L and T being available as
    globals, and on the Classify and MorphRank classes of this notebook.

    A `rank_scale` argument is a dictionary as produced by
    Rank.get_rank_dict(): {rank name: {'range': [low, high], 'weight': w}}.
    A word falls in a rank when low <= freq_lex < high.

    All weight methods skip the lexemes in Classify.stop_words when adding
    penalties, but still count those words in the "all words" denominator.
    They return 0.0 for a passage that has nothing to divide by instead of
    raising ZeroDivisionError.
    """

    paragraph_markers = {'פ': 'open', 'ס': 'closed'}

    def __init__(self, id):
        self.id = id
        self.verses = []  # a list of verse node ints.
        self.words = []  # a list of word node ints.
        self.start_word = 0
        self.end_word = 0
        self.start_verse = 0
        self.end_verse = 0
        self.word_count = 0
        self.weight0 = 0
        self.weight1 = 0
        self.weight2a = 0  # unique lexemes, all-words denominator
        self.weight2b = 0  # unique lexemes, unique-lexemes denominator
        self.weight3a = 0  # decaying penalty, all-words denominator
        self.weight3b = 0  # decaying penalty, unique-lexemes denominator
        self.weight3c = 0  # 3a with the morphology penalty added
        self.verb_types_present = set()
        self.verb_stems_present = set()
        self.word_ranks_data = {}
        self.start_ref = ''
        self.end_ref = ''

    # ------------------------------------------------------------------
    # Private helpers shared by the weight methods.
    # ------------------------------------------------------------------

    @staticmethod
    def _content_words(words):
        """Yield (word node, lexeme) for every word that is not a stop word."""
        stop_words = Classify.stop_words
        for word in words:
            lex = F.voc_lex_utf8.v(word)
            if lex not in stop_words:
                yield word, lex

    @staticmethod
    def _matching_weights(rank_scale, lex_freq):
        """Return the weight of every rank whose [low, high) range holds lex_freq."""
        return [
            entry['weight']
            for entry in rank_scale.values()
            if entry['range'][0] <= lex_freq < entry['range'][1]
        ]

    # ------------------------------------------------------------------
    # Text access.
    # ------------------------------------------------------------------

    def get_all_words(self):
        """Return a list of all word nodes in the passage, verse by verse."""
        return [word for verse in self.verses for word in L.i(verse, otype='word')]

    def get_vs_words(self, verse):
        """Return a list of the word nodes in the given verse node."""
        return list(L.i(verse, otype='word'))

    def get_text(self):
        """Return a string of all the text in the passage."""
        return T.text(self.verses, fmt='text-orig-full')

    # ------------------------------------------------------------------
    # Weights.
    # ------------------------------------------------------------------

    def get_vs_weights(self, rank_scale):
        """
        Return a dictionary mapping each verse node in the passage to its
        weight: the summed rank weights of the verse's non-stop words divided
        by the number of words in the verse, rounded to 4 places. A verse
        without words gets 0.0.
        """
        verse_weights = {}
        for verse in self.verses:
            words = self.get_vs_words(verse)
            if not words:
                verse_weights[verse] = 0.0
                continue
            verse_weight = 0
            for word, _lex in self._content_words(words):
                for rank_weight in self._matching_weights(rank_scale, F.freq_lex.v(word)):
                    verse_weight += rank_weight
            verse_weights[verse] = round(verse_weight / len(words), 4)
        return verse_weights

    def get_passage_weight0(self):
        """
        Weight 0: the mean of (10000 - freq_lex) over the passage, so that
        rarer words contribute more. Stop words contribute nothing.
        """
        if not self.words:
            return 0.0
        total_weight = 0
        for word, _lex in self._content_words(self.words):
            total_weight += 10000 - F.freq_lex.v(word)
        return round(total_weight / len(self.words), 4)

    def get_passage_weight1(self, rank_scale):
        """
        Weight 1: every non-stop word adds the weight of its rank; proper
        nouns (sp == 'nmpr') add half of it. Divided by the word count.
        """
        if not self.words:
            return 0.0
        total_weight = 0
        for word, _lex in self._content_words(self.words):
            is_proper = F.sp.v(word) == 'nmpr'
            for rank_weight in self._matching_weights(rank_scale, F.freq_lex.v(word)):
                total_weight += rank_weight / 2 if is_proper else rank_weight
        return round(total_weight / len(self.words), 4)

    def get_passage_weight2(self, rank_scale, div_all=True):
        """
        Weight 2: like weight 1, but each lexeme is penalized only once.

        div_all=True divides by the number of words in the passage,
        div_all=False by the number of distinct non-stop lexemes.
        """
        if not self.words:
            return 0.0
        total_weight = 0
        unique_words = set()
        for word, lex in self._content_words(self.words):
            if lex in unique_words:
                continue
            unique_words.add(lex)
            is_proper = F.sp.v(word) == 'nmpr'
            for rank_weight in self._matching_weights(rank_scale, F.freq_lex.v(word)):
                total_weight += rank_weight / 2 if is_proper else rank_weight
        denominator = len(self.words) if div_all else len(unique_words)
        if denominator == 0:
            # Only reachable with div_all=False when every word is a stop word.
            return 0.0
        return round(total_weight / denominator, 4)

    def get_passage_weight3(self, rank_scale, div_all=True, morph=False):
        """
        Weight 3: a penalty per lexeme that decays as the lexeme recurs.

        - First occurrence: the weight of the lexeme's rank. For a proper
          noun whose rank weight exceeds min_penalty, half of it rounded up.
        - n-th occurrence (n >= 2) of a lexeme with freq_lex < 100:
          max(penalty - n, min_penalty).
        - Recurrences of more frequent lexemes add the full penalty again.
        - morph=True adds, for every non-stop verb, its MorphRank stem and
          tense penalties (codes missing from either map add 0).

        div_all selects the denominator (all words vs. distinct non-stop
        lexemes). The morphology penalty is divided by the same denominator
        in both cases; previously it was dropped when div_all was False.
        """
        if not self.words:
            return 0.0
        min_penalty = 1.7  # min penalty for rare words and proper nouns.
        rare_below = 100   # only lexemes rarer than this get a decaying penalty.
        word_weights = {}
        verb_weight = 0
        for word, lex in self._content_words(self.words):
            entry = word_weights.get(lex)
            if entry is None:
                # First occurrence: look up the penalty and charge it in full.
                penalty = 0
                is_proper = F.sp.v(word) == 'nmpr'
                for rank_weight in self._matching_weights(rank_scale, F.freq_lex.v(word)):
                    if is_proper and rank_weight > min_penalty:
                        penalty = math.ceil(rank_weight / 2)
                    else:
                        penalty = rank_weight
                word_weights[lex] = {'count': 1, 'weight': penalty, 'penalty': penalty}
            else:
                entry['count'] += 1
                if F.freq_lex.v(word) < rare_below:
                    entry['weight'] += max(entry['penalty'] - entry['count'], min_penalty)
                else:
                    entry['weight'] += entry['penalty']
            if morph and F.sp.v(word) == 'verb':
                verb_weight += (
                    MorphRank.stem_map.get(F.vs.v(word), 0)
                    + MorphRank.tense_map.get(F.vt.v(word), 0)
                )

        total_weight = sum(entry['weight'] for entry in word_weights.values())
        denominator = len(self.words) if div_all else len(word_weights)
        if denominator == 0:
            # Only reachable with div_all=False when every word is a stop word.
            return 0.0
        total_weight = total_weight / denominator + verb_weight / denominator
        return round(total_weight, 4)

In [7]:
# Scratch check of integer floor division: 3 // 4 evaluates to 0.
print(3//4)

0


In [8]:
class Passages:
    """
    A class to store passages. It includes methods to sort the passages
    by attributes such as word count and weight, and it stores the rank scale
    used to create the passage list. The passages as given are kept in
    `order_sorted`.

    passages:   a list of passage objects exposing word_count and the
                weight0 ... weight3c attributes.
    rank_scale: the scale the passages were weighed with; either a Rank-like
                object (ranks / ranges / weights attributes) or a rank_scale
                dictionary. Defaults to a fresh empty dict per instance.

    Each weight_sorted* attribute is a dict mapping a passage to its position
    (0-based) when the passages are sorted by that weight in ascending order,
    so the different weightings can be compared side by side in a dataframe.
    Iterating such a dict yields the passages in sorted order.
    """

    def __init__(self, passages, rank_scale=None):
        # A None sentinel avoids sharing one mutable default dict between instances.
        self.rank_scale = {} if rank_scale is None else rank_scale
        self.order_sorted = passages
        self.word_count_sorted = self.word_count_sort()
        self.weight_sorted0 = self.weight_sort0()
        self.weight_sorted1 = self.weight_sort1()
        self.weight_sorted2a = self.weight_sort2a()
        self.weight_sorted2b = self.weight_sort2b()
        self.weight_sorted3a = self.weight_sort3a()
        self.weight_sorted3b = self.weight_sort3b()
        self.weight_sorted3c = self.weight_sort3c()

    def _rank_by(self, attribute):
        """Map each passage to its position when sorted ascending by `attribute`."""
        ordered = sorted(self.order_sorted, key=lambda p: getattr(p, attribute))
        return {passage: position for position, passage in enumerate(ordered)}

    def word_count_sort(self):
        """Return the passages as a list sorted by ascending word count."""
        return sorted(self.order_sorted, key=lambda p: p.word_count)

    def weight_sort0(self):
        """Passage -> position when sorted by weight0."""
        return self._rank_by('weight0')

    def weight_sort1(self):
        """Passage -> position when sorted by weight1."""
        return self._rank_by('weight1')

    def weight_sort2a(self):
        """Passage -> position when sorted by weight2a."""
        return self._rank_by('weight2a')

    def weight_sort2b(self):
        """Passage -> position when sorted by weight2b."""
        return self._rank_by('weight2b')

    def weight_sort3a(self):
        """Passage -> position when sorted by weight3a."""
        return self._rank_by('weight3a')

    def weight_sort3b(self):
        """Passage -> position when sorted by weight3b."""
        return self._rank_by('weight3b')

    def weight_sort3c(self):
        """Passage -> position when sorted by weight3c (the morphology variant)."""
        return self._rank_by('weight3c')

    def print_scale(self):
        """
        Return (not print) the rank scale as a multi-line string, one line per
        rank: "<rank>:   \\t<weight>\\t<low>-<high> occ". An empty scale gives "".
        """
        scale = self.rank_scale
        if isinstance(scale, dict):
            rows = [(rank, entry['range'], entry['weight']) for rank, entry in scale.items()]
        else:
            rows = [
                (rank, scale.ranges[i], scale.weights[i])
                for i, rank in enumerate(scale.ranks)
            ]
        return "".join(
            f"{rank}:   \t{weight}\t{_range[0]}-{_range[1]} occ\n"
            for rank, _range, weight in rows
        )

# Methods

In [9]:
class CompareData:
    """
    A class used to compare mismatches between differently sorted lists.

    The compared items must expose a `start_ref` attribute, which is used as
    the key that identifies the same passage in both orderings.
    """

    def update_index(self, dict, key, i):
        """
        Record position i for `key` in `dict`. The first time a key is seen
        its position is stored; the next time the stored value is replaced by
        the distance between the new position and the stored one.
        """
        if key in dict:
            dict[key] = i - dict[key]
        else:
            dict[key] = i

    def compare_mismatches(self, list_a, list_b, sorted=False):
        """
        Walk both orderings in parallel and return {start_ref: distance
        between the item's positions in list_a and list_b}.

        sorted=True returns the same mapping ordered by descending distance.
        """
        # The `sorted` flag shadows the builtin inside this method, so the
        # builtin has to be reached explicitly; calling the flag raised
        # "TypeError: 'bool' object is not callable".
        import builtins

        mism = {}
        for i, (a, b) in enumerate(zip(list_a, list_b)):
            self.update_index(mism, a.start_ref, i)
            self.update_index(mism, b.start_ref, i)
        if sorted:
            return dict(builtins.sorted(mism.items(), key=lambda item: item[1], reverse=True))
        return mism

    def average_mismatch(self, mism):
        """Return the mean distance in `mism`, or 0.0 when it is empty."""
        if not mism:
            return 0.0
        return sum(mism.values()) / len(mism)

    def max_mismatch(self, mism):
        """
        Return the (key, distance) pair with the largest distance (the last
        such pair on ties), or None when `mism` is empty.
        """
        if not mism:
            return None
        return sorted(mism.items(), key=lambda item: item[1])[-1]

## Passage Retrieval

In [10]:
def valid_passage(passage, verse, passage_size_min, passage_size_max):
    """
    Used by get_passages().

    Decide whether `verse` ends the passage that is being built.

    Returns (is_valid, add_verse):
      is_valid  - True when the passage is complete.
      add_verse - True when `verse` still belongs to the current passage;
                  False when it is the first verse of the next one.

    Rules, in order:
      1. A verse in a different book than the passage's last verse closes
         the passage and starts the next one.
      2. Ruth, Jonah, Ecclesiastes and Psalms lack enough paragraph markers
         to make meaningful passages, so they are split at chapter changes.
         Psalm 119 is additionally split into 8-verse sections (verses 9,
         17, ... each start a new passage) to preserve the acrostic.
      3. Elsewhere the passage ends with a verse whose text ends in a
         paragraph marker, provided the passage including that verse has at
         least passage_size_min words.

    `passage` must already hold at least one verse. passage_size_max is
    accepted but not used yet (see the TODO below).
    """
    previous_verse = passage.verses[-1]
    verse_book = L.u(verse, otype='book')[0]

    # 1. A new book always closes the passage.
    if L.u(previous_verse, otype='book')[0] != verse_book:
        return True, False

    # 2. Books that are split at the chapter level.
    chapter_level_books = [
        T.bookNode(name) for name in ('Ruth', 'Jonah', 'Ecclesiastes', 'Psalms')
    ]
    if verse_book in chapter_level_books:
        if L.u(verse, otype='chapter')[0] != L.u(previous_verse, otype='chapter')[0]:
            return True, False
        # The stanza test uses the chapter and verse *numbers*. The former
        # test on the verse node id and a hard-coded chapter node only lined
        # up with the stanzas if the node numbering happened to cooperate.
        section = T.sectionFromNode(verse)
        in_psalm_119 = verse_book == T.bookNode('Psalms') and section[1] == 119
        if in_psalm_119 and (section[2] - 1) % 8 == 0:
            return True, False
        return False, True

    # 3. Otherwise look for a paragraph marker at the end of the verse.
    verse_ending = T.text(verse).split()[-1]
    if verse_ending in passage.paragraph_markers:
        word_total = len(passage.get_all_words()) + len(passage.get_vs_words(verse))
        if word_total >= passage_size_min:
            return True, True

    # TODO Optimize this to create meaningful passages: also end the passage
    # (without adding the verse) once it would exceed passage_size_max words.
    # Counting the words with get_all_words() for every verse greatly
    # increases the run time, so that check is not enabled.

    return False, True

In [13]:
def update_passage_data(passage, rank_scale):
    """
    Used by get_passages().

    Fill in all derived data of a passage instance once its end verse has
    been reached: its words, boundaries, word count, the seven weights, the
    verb types and stems present, the per-rank word statistics and the
    start / end references (book name cut to 6 characters, "chapter:verse").

    rank_scale is a dictionary {rank name: {'range': [low, high], 'weight': w}}.
    """
    # TODO to print nouns in red.
    passage.word_ranks_data = {
        rank_name: {'occ': 0, 'words': set()} for rank_name in rank_scale.keys()
    }
    passage.words = passage.get_all_words()

    # Boundaries and size.
    first_verse, last_verse = passage.verses[0], passage.verses[-1]
    passage.start_verse, passage.end_verse = first_verse, last_verse
    passage.start_word, passage.end_word = passage.words[0], passage.words[-1]
    passage.word_count = len(passage.words)

    # The seven weightings.
    passage.weight0 = passage.get_passage_weight0()
    passage.weight1 = passage.get_passage_weight1(rank_scale)
    passage.weight2a = passage.get_passage_weight2(rank_scale)
    passage.weight2b = passage.get_passage_weight2(rank_scale, div_all=False)
    passage.weight3a = passage.get_passage_weight3(rank_scale)
    passage.weight3b = passage.get_passage_weight3(rank_scale, div_all=False)
    passage.weight3c = passage.get_passage_weight3(rank_scale, morph=True)

    # Verb data and per-rank word statistics (stop words are counted here too).
    for node in passage.words:
        if F.sp.v(node) == 'verb':
            passage.verb_types_present.add(F.vt.v(node))
            passage.verb_stems_present.add(F.vs.v(node))
        frequency = F.freq_lex.v(node)
        for rank_name, entry in rank_scale.items():
            bounds = entry['range']
            if bounds[0] <= frequency < bounds[1]:
                bucket = passage.word_ranks_data[rank_name]
                bucket['occ'] += 1
                bucket['words'].add(F.voc_lex_utf8.v(node))

    # Human-readable start and end references.
    first_section = T.sectionFromNode(first_verse)
    last_section = T.sectionFromNode(last_verse)
    passage.start_ref = f"{first_section[0][:6]} {first_section[1]}:{first_section[2]}"
    passage.end_ref = f"{last_section[0][:6]} {last_section[1]}:{last_section[2]}"

In [12]:
def get_passages(
    rank_scale,
    start_node=0,
    end_node=None,
    passage_size_min=100,
    passage_size_max=4000
    ):
    """
    Iterates over verses in the OT and combines them into passages.
    The function returns a list of Passage objects.

    rank_scale - a rank_scale dictionary as generated by Rank.get_rank_dict().
        For example:
            rank_scale = LexRanks._10_ranks.get_rank_dict()

    start_node - index into the list of verse nodes at which to begin.

    end_node - index into the list of verse nodes at which to stop
        (exclusive). None, the default, means "through the last verse"; it is
        resolved when the function is called rather than when it is defined.

    passage_size_min - the minimum number of words in a passage that ends at
        a paragraph marker. Passages that end at a chapter or book boundary
        can be shorter (e.g., Psalm 117).

    passage_size_max - passed on to valid_passage().

    The verses that remain after the last boundary are returned as a final
    passage; previously they were silently dropped.
    """
    # A list of all passages.
    passages = []

    # Initiate the id counter and instantiate the first passage.
    passage_id = 1
    passage = Passage(id=passage_id)

    for verse in F.otype.s('verse')[start_node:end_node]:
        # valid_passage() is only consulted once the passage holds more than
        # one verse; the first two verses are always added unconditionally.
        if len(passage.verses) > 1:
            valid, add_verse = valid_passage(passage, verse, passage_size_min, passage_size_max)
            if valid:
                # We have reached the end of the passage, so finalize it.
                if add_verse:
                    passage.verses.append(verse)
                update_passage_data(passage, rank_scale)
                passages.append(passage)

                # Begin a new passage.
                passage_id += 1
                passage = Passage(id=passage_id)

                # The current verse is in a new section, chapter or book, so
                # it becomes the start verse of the newly created passage.
                if not add_verse:
                    passage.verses.append(verse)
                continue

        # Not at a boundary: the verse belongs to the current passage.
        passage.verses.append(verse)

    # Flush the passage that was still being built when the verses ran out.
    if passage.verses:
        update_passage_data(passage, rank_scale)
        passages.append(passage)

    return passages

## Generate Lex Sample Clauses

In [14]:
def get_sample_weight(sample, rank_scale):
    """
    Return the difficulty weight of a sample node (e.g. a sentence), rounded
    to 4 places.

    sample     - a node whose words are obtained with L.d(sample, otype='word').
    rank_scale - {rank name: {'range': [low, high], 'weight': w}}; a word is
                 in a rank when low <= freq_lex < high.

    Per lexeme, skipping the lexemes in Classify.stop_words:
      - the first occurrence adds the weight of its rank; for a proper noun
        whose rank weight exceeds min_penalty, half of it but never less
        than min_penalty;
      - the n-th occurrence (n >= 2) of a lexeme with freq_lex < 100 adds
        max(penalty - n, min_penalty);
      - recurrences of more frequent lexemes add the full penalty again.

    The sum is divided by the number of words in the sample, stop words
    included. A sample without words weighs 0.0 instead of raising
    ZeroDivisionError.
    """
    min_penalty = 1.7  # min penalty for rare words and proper nouns.
    rare_below = 100   # only lexemes rarer than this get a decaying penalty.

    words = L.d(sample, otype='word')
    if not words:
        return 0.0

    stop_words = Classify.stop_words
    word_weights = {}
    for word in words:
        lex = F.voc_lex_utf8.v(word)
        if lex in stop_words:
            continue
        lex_freq = F.freq_lex.v(word)
        entry = word_weights.get(lex)
        if entry is None:
            # First occurrence: look up the penalty and charge it in full.
            penalty = 0
            is_proper = F.sp.v(word) == 'nmpr'
            for scale_entry in rank_scale.values():
                bounds = scale_entry['range']
                if bounds[0] <= lex_freq < bounds[1]:
                    rank_weight = scale_entry['weight']
                    if is_proper and rank_weight > min_penalty:
                        penalty = max(rank_weight / 2, min_penalty)
                    else:
                        penalty = rank_weight
            word_weights[lex] = {'count': 1, 'weight': penalty, 'penalty': penalty}
        else:
            entry['count'] += 1
            if lex_freq < rare_below:
                entry['weight'] += max(entry['penalty'] - entry['count'], min_penalty)
            else:
                entry['weight'] += entry['penalty']

    total_weight = sum(entry['weight'] for entry in word_weights.values())
    return round(total_weight / len(words), 4)

In [15]:
# Get example sentences for each lexeme:
# lex_samples maps a lex node to a list of [sentence_weight, sentence_node].
MAX_SAMPLE_LEX_FREQ = 5000  # lexemes occurring this often or more get no samples.

lex_samples = {}
rank = LexRanks._10_ranks.get_rank_dict()
for sentence in F.otype.s('sentence'):
    s_weight = get_sample_weight(sentence, rank)
    # Each sentence is listed at most once per lexeme, even when the lexeme
    # occurs several times in it. (The earlier membership test compared the
    # sentence node with the stored weights, so it never filtered anything.)
    lexemes_in_sentence = set()
    for w in L.d(sentence, otype='word'):
        if F.freq_lex.v(w) >= MAX_SAMPLE_LEX_FREQ:
            continue
        lex = L.u(w, otype='lex')[0]
        if lex in lexemes_in_sentence:
            continue
        lexemes_in_sentence.add(lex)
        lex_samples.setdefault(lex, []).append([s_weight, sentence])

In [25]:
# Flatten lex_samples into column lists, keeping only the lexemes that have
# more than one sample sentence. Each sample is a [sentence_weight, sentence_node] pair.
sample_rows = [
    (lex_id, sample[1], sample[0])
    for lex_id, samples in lex_samples.items()
    if len(samples) > 1
    for sample in samples
]
data = {
    "lexId": [row[0] for row in sample_rows],
    "sentenceId": [row[1] for row in sample_rows],
    "sentenceWeight": [row[2] for row in sample_rows],
}

In [26]:
# pandas is imported here because, in notebook order, this cell comes before
# the cell that imports it; without this the cell fails on a fresh kernel.
import pandas as pd

# Write the lexeme / sample-sentence table as a tab-separated file.
df = pd.DataFrame(data)
df.to_csv('../../data_files/lex_sentences.csv', sep='\t', encoding='utf-8', index=False)

## Display and Export Passage Data

In [19]:
def format_output(output):
    """
    A helper function to format printing output.

    Takes a set (or any iterable) of words and returns them as one string,
    sorted alphabetically and separated by two spaces. An empty input gives
    an empty string.

    The separator is placed by position, so a word equal to the last one no
    longer loses the spacing after it when the input contains duplicates.
    """
    return '  '.join(sorted(output))

### As DataFrame

In [18]:
import pandas as pd
from IPython.display import display, HTML

In [32]:
# One less than the first verse node. Subtracting this offset from a verse
# node renumbers the verses as 1 .. last verse (used by custom_df below).
START_VS = F.otype.s('verse')[0] - 1

def custom_df(passages):
    """
    Build the export table for an iterable of passages.

    Returns a DataFrame with one row per passage and the columns passageId,
    wordCount, weight (the passage's weight3a), startVsId and endVsId. The
    verse ids are the verse nodes minus START_VS.
    """
    columns = ('passageId', 'wordCount', 'weight', 'startVsId', 'endVsId')
    rows = [
        (p.id, p.word_count, p.weight3a, p.start_verse - START_VS, p.end_verse - START_VS)
        for p in passages
    ]
    table = {
        column: [row[position] for row in rows]
        for position, column in enumerate(columns)
    }
    return pd.DataFrame(table)

In [22]:
import time

def all_ranks(rank_scales=None):
    """
    Build the passages for one or more ranking scales and time the work.

    rank_scales - an iterable of Rank objects to run. None, the default,
                  runs only the 10-rank scale (LexRanks._10_ranks), as this
                  function always did. Pass LexRanks.all_ranks to run every
                  scale.

    Returns a list with one Passages object per scale, in the given order.
    After each scale a line "<name> complete <seconds since start>" is printed.
    """
    start = time.time()
    if rank_scales is None:
        rank_scales = [LexRanks._10_ranks]

    all_passage_rankings = []
    for r_s in rank_scales:
        rank_scale = r_s.get_rank_dict()
        # get_passages() also accepts start_node / end_node / passage_size_min
        # to restrict a run while experimenting.
        all_p = Passages(
            passages=get_passages(rank_scale),
            rank_scale=r_s,
        )
        all_passage_rankings.append(all_p)
        print(r_s.name, "complete", time.time()-start)
    return all_passage_rankings

In [23]:
# Build the passages; all_ranks() returns a list with one Passages object per scale run.
all_rankings = all_ranks()

10_ranks complete 31.837373971939087


In [33]:
# weight_sorted3a is a dict keyed by passage in ascending weight3a order, so
# iterating it gives custom_df the passages in that order.
df = custom_df(all_rankings[0].weight_sorted3a)

In [34]:
# EXPORT
# Write the passage table as a tab-separated file without the index column.
df.to_csv('../../data_files/passages.csv', sep='\t', encoding='utf-8', index=False)