In [1]:
from tf.app import use

A = use('bhsa', hoist=globals())

This is Text-Fabric 9.1.1
Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html

122 features found and 0 ignored


In [2]:
class Ranks:
    ranks1 = ['Frequent', 'Common', 'Medium', 'Uncommon', 'Rare']
    ranks2 = ['Abundant', 'Frequent', 'Common', 'Average', 'Uncommon', 'Rare', 'Scarce']
    ranks3 = ['Frequent', 'Medium', 'Uncommon', 'Rare']
    ranks4 = ['Frequent', 'Uncommon', 'Rare']
    ranks5 = ['Frequent', 'Common', 'Infrequent', 'Rare', 'Scarce']
    ranks6 = ['Abundant', 'Frequent', 'Common', 'Average', 'Uncommon', 'Rare', 'Rarer', 'Scarce', 'Scarcer', 'Scarcest']

    # Using 2-elem lists is far faster than searching ranges. 
    # Rather than if i in range(), check if i > l[0] and <= l[1].
    # Using this method scales runtime from ~0:04:30 to ~0:00:15.
    ranges1 = [
        [500, 51000],
        [250, 500],
        [150, 250],
        [50, 150],
        [1, 50],
    ]
    weights1 = [1, 2, 3, 5, 8]

    ranges2 = [
        [800, 51000],
        [400, 800],
        [200, 400],
        [100, 200],
        [50, 100],
        [15, 50],
        [1, 15],
    ]
    weights2 = [1, 1.1, 1.3, 1.7, 3, 5.5, 8.5]

    ranges3 = [
        [100, 51000],
        [50, 100],
        [10, 50],
        [1, 10],
    ]
    weights3 = [1, 4, 5, 8]

    ranges4 = [
        [100, 51000],
        [10, 100],
        [1, 10],
    ]
    weights4 = [1, 3, 7]

    ranges5 = [
        [200, 51000],
        [100, 200],
        [50, 100],
        [20, 50],
        [1, 20],
    ]
    weights5 = [1, 1.5, 3, 5, 8]

    ranges6 = [
        [1000, 51000],
        [400, 1000],
        [200, 400],
        [100, 200],
        [50, 100],
        [40, 50],
        [30, 40],
        [20, 30],
        [10, 20],
        [1, 10]
    ]
    weights6 = [1, 1.1, 1.3, 1.7, 3, 5.5, 7, 8, 9, 10]

    all_ranks = [
        ranks1, 
        ranks2, 
        ranks3, 
        ranks4, 
        ranks5,
        ranks6,
    ]
    all_ranges = [
        ranges1, 
        ranges2, 
        ranges3, 
        ranges4, 
        ranges5,
        ranges6, 
    ]
    all_weights = [
        weights1, 
        weights2, 
        weights3, 
        weights4, 
        weights5,
        weights6,
    ]

In [3]:
"""
A class that contains data to help assign difficulty weights to 
any portion of Hebrew text that has more than one word. 
"""
class Classify:
    
    # Pass in your choice of ranges and weights from the Ranks class.
    def __init__(self, ranks={}, ranges={}, weights={}):
        self.ranks = ranks
        self.ranges = ranges 
        self.weights = weights

    """ 
    Notes on stop_words_types and other exclusion lists. 

    Most prepositions, articles, and conjunctions don't
    add any meaningul weight to a text and could thus be exlcuded.
    
    Example use:
    words = [w for w in passage if F.sp.v(w) not in stop_words_types]
    
    Note: the only Heb article is 'הַ' with 30,386 occurences. There are some 
    preps and conjs that have few occurences, so I recommend not using
    stop_words_types when weighing passages and using stop_words instead.
    """
    stop_words_types = ['prep', 'art', 'conj']
    # Check if F.voc_lex_utf8.v(word) is in this list. If
    # so it can be excluded since it occurs so often. 
    stop_words = ['אֵת', 'בְּ', 'לְ', 'הַ', 'וְ']
    # If you take verb data into account when weighing a
    # paragraph, these common types could be excluded. 
    easy_vtypes = ['perf', 'impf', 'wayq']
    easy_vstems = ['qal', 'hif', 'nif', 'piel']

    """
    rank_scale() returns a dict containing frequency ranking 
    buckets for a variety of lexical ranges.
    
    Example use of rank_scale():

    r = Ranks()
    rank_scale = Classify(r.ranks1, r.ranges1, r.weights1).rank_scale()
    for rank in rank_scale.keys():
        lex_freq = F.freq_lex.v(word)
        range = rank_scale[rank]['range']
        if lex_freq >= range[0] and lex_freq < range[1]:
            total_weight += rank_scale[rank]['weight']
    """
    # Create frequency ranking buckets for a variety of lexical ranges.
    def rank_scale(self):
        rank_scale = {}
        for i in range(len(self.ranks)):
            rank_scale[self.ranks[i]] = {
                'range': self.ranges[i],
                'weight': self.weights[i]
            }
        return rank_scale 

In [4]:
"""
A class that contains a Hebrew passage, consisting of paragraphs 
as marked by a petach (פ) or samech (ס) in the Masoretic Text. 
"""
class Passage:

    def __init__(self, id):
        self.id = id
        self.verses = [] # a list of verse node ints. 
        self.words = [] # a list of word node ints. 
        self.start_word = 0
        self.end_word = 0
        self.word_count = 0
        self.weight1 = 0
        self.weight2a = 0 # all words denom
        self.weight2b = 0 # unique words
        self.weight3a = 0 # all word denom
        self.weight3b = 0
        self.paragraph_markers = {'פ': 'open', 'ס': 'closed'}
        self.verb_types_present = set()
        self.verb_stems_present = set()
        self.word_ranks_data = {}
        self.start_ref = ''
        self.end_ref = ''

    # Reset the values of all the passage's mutable attributes.
    def reset_values(self):
        self.verses = []
        self.words = []
        self.start_word = 0
        self.end_word = 0
        self.word_count = 0
        self.weight1 = 0
        self.weight2a = 0
        self.weight2b = 0 
        self.weight3a = 0 
        self.weight3b = 0
        self.verb_types_present = set()
        self.verb_stems_present = set()
        self.word_ranks_data = {}
        self.start_ref = ''
        self.end_ref = ''

    # Returns a list of all words present in the passage.
    def get_all_words(self):
        words = []
        for verse in self.verses:
            for word in L.i(verse, otype='word'):
                words.append(word)
        return words

    # Returns a list of all words present in a specified verse in the passage.
    def get_vs_words(verse):
        verse_words = [w for w in L.i(verse, otype='word')]
        return verse_words

    # Returns a String of all the text in the passage.
    def get_text(self):
        return T.text(self.verses, fmt='text-orig-full')

    """
    get_vs_weights() returns a dictionary mapping each verse node in 
    the passage to a weight. It takes rank_scale as input, an instance
    of Classify(args).rank_scale() (see notes in Classify for instantiaion).
    """
    def get_vs_weights(self, rank_scale):
        # A dictionary mapping verse nodes to weights.
        verse_weights = {}
        # Iterate over verses in the passage.
        for verse in self.verses:
            verse_weight = 0
            words = self.get_vs_words(verse)
            # Add the scaled word weights to the verse's total weight.
            for word in words:
                if F.voc_lex_utf8.v(word) not in Classify().stop_words:
                    for rank in rank_scale.keys():
                        lex_freq = F.freq_lex.v(word)
                        range = rank_scale[rank]['range']
                        if lex_freq >= range[0] and lex_freq < range[1]:
                            verse_weight += rank_scale[rank]['weight']
            # Add the verse's weight to the dictionary at this verse's key. 
            verse_weight /= len(words)
            verse_weights[verse] = round(verse_weight, 4)
        
        return verse_weights

    def get_passage_weight1(self, rank_scale):
        total_weight = 0
        # Iterate over words in the passage.
        for word in self.words:
            if F.voc_lex_utf8.v(word) not in Classify().stop_words:
                # Iterate over the ranks present in the rank scale. 
                for rank in rank_scale.keys():
                    lex_freq = F.freq_lex.v(word)
                    range = rank_scale[rank]['range']
                    if lex_freq >= range[0] and lex_freq < range[1]:
                        # Give a half penalty for proper nouns. 
                        if F.sp.v(word) == 'nmpr': # proper noun
                            total_weight += (rank_scale[rank]['weight']) / 2
                        # Give a full penalty for other word types. 
                        else:
                            total_weight += rank_scale[rank]['weight']
        total_weight /= len(self.words)
        
        return round(total_weight, 4)

    # Only penalize once per lexical value.  
    def get_passage_weight2(self, rank_scale, div_all=True):
        total_weight = 0
        unique_words = set()
        # Iterate over words in the passage.
        for word in self.words:
            lex = F.voc_lex_utf8.v(word)
            if lex not in Classify().stop_words and lex not in unique_words:
                # Iterate over the ranks present in the rank scale. 
                for rank in rank_scale.keys():
                    lex_freq = F.freq_lex.v(word)
                    range = rank_scale[rank]['range']
                    if lex_freq >= range[0] and lex_freq < range[1]:
                        # Give a half penalty for proper nouns. 
                        if F.sp.v(word) == 'nmpr': # proper noun
                            total_weight += (rank_scale[rank]['weight']) / 2
                        # Give a full penalty for other word types. 
                        else:
                            total_weight += rank_scale[rank]['weight']
                unique_words.add(lex)
        # Compare using all words as denominator vs. unique words.
        if div_all:
            total_weight /= len(self.words)
        else:
            total_weight /= len(unique_words)
        
        return round(total_weight, 4)

    # Decrease penalty for each occurance. 
    def get_passage_weight3(self, rank_scale, div_all=True):
        word_weights = {}
        # Iterate over words in the passage.
        for word in self.words:
            lex = F.voc_lex_utf8.v(word)
            if lex not in Classify().stop_words:
                # Add partial penalty for reocurring words. 
                if lex in word_weights.keys():
                    # Only gradually decrease penalty for rarer words. 
                    # Decreases by 1 point per occurance. 
                    word_weights[lex]['count'] += 1
                    if F.freq_lex.v(word) < 100:
                        count = word_weights[lex]['count']
                        penalty = word_weights[lex]['penalty']
                        new_weight = penalty - count 
                        added_weight = new_weight if new_weight >= 1 else 1
                        word_weights[lex]['weight'] += added_weight
                    else:
                        word_weights[lex]['weight'] += word_weights[lex]['penalty']
                # Add full penalty for the first occurance. 
                else:
                    # Add word to hash table
                    word_weights[lex] = {'count':0, 'weight':0, 'penalty':0}
                    # Iterate over the ranks present in the rank scale. 
                    for rank in rank_scale.keys():
                        lex_freq = F.freq_lex.v(word)
                        range = rank_scale[rank]['range']
                        if lex_freq >= range[0] and lex_freq < range[1]:
                            # Give a half penalty for proper nouns. 
                            if F.sp.v(word) == 'nmpr': # proper noun
                                word_weights[lex]['penalty'] = (rank_scale[rank]['weight']) / 2
                            # Give a full penalty for other word types. 
                            else:
                                word_weights[lex]['penalty'] = rank_scale[rank]['weight']
                    word_weights[lex]['weight'] += word_weights[lex]['penalty']
                    word_weights[lex]['count'] += 1
        # Get the sum of all word weights. 
        total_weight = sum([w for w in [word_weights[k]['weight'] for k in word_weights.keys()]])
        # Compare using all words as denominator vs. unique words.
        if div_all:
            total_weight /= len(self.words)
        else:
            total_weight /= len(word_weights)
        
        return round(total_weight, 4)

In [5]:
class AllPassages:
    
   def __init__(self, rank_scale={}):
      self.rank_scale = rank_scale
      self.order_sorted = []
      self.word_count_sorted = []
      self.weight_sorted1 = {}
      self.weight_sorted2a = {}
      self.weight_sorted2b = {}
      self.weight_sorted3a = {}
      self.weight_sorted3b = {}
   
   def word_count_sort(self):
      return sorted(self.order_sorted, key=lambda p: p.word_count)

   def weight_sort1(self):
      sorted_list = sorted(self.order_sorted, key=lambda p: p.weight1)
      return {sorted_list[i]:i for i in range(len(sorted_list))}

   def weight_sort2a(self):
      sorted_list = sorted(self.order_sorted, key=lambda p: p.weight2a)
      return {sorted_list[i]:i for i in range(len(sorted_list))}

   def weight_sort2b(self):
      sorted_list = sorted(self.order_sorted, key=lambda p: p.weight2b)
      return {sorted_list[i]:i for i in range(len(sorted_list))}

   def weight_sort3a(self):
      sorted_list = sorted(self.order_sorted, key=lambda p: p.weight3a)
      return {sorted_list[i]:i for i in range(len(sorted_list))}

   def weight_sort3b(self):
      sorted_list = sorted(self.order_sorted, key=lambda p: p.weight3b)
      return {sorted_list[i]:i for i in range(len(sorted_list))}

   def print_scale(self):
      scale = self.rank_scale
      output_text = ""
      for rank in scale.keys():
         range = scale[rank]['range'] 
         weight = scale[rank]['weight']
         output = f"w{weight} for {range[0]}-{range[1]} occ"
         output_text += f"{rank}: {output}\n"
      return output_text

In [22]:
"""
Iterates over all the verses in the OT and combines them
into passages. The function returns a list of Passage objects. 

Takes rank_scale, an instance of Classify().rank_scale().
For example:
    r = Ranks()
    rank_scale = Classify(r.ranks1, r.ranges1, r.weights1).rank_scale()
"""
# Set the minimal word requiremnt for a passage.
# If a passage has less words than specified, it
# will be comined with the following passage(s) 
# until len(passage.get_all_words()) >= passage_size.
def get_passages(
    rank_scale, 
    end_node=len(F.otype.s('verse')), 
    passage_size=100
    ):

    # A list of all passages.
    passages = []

    # Initiate the id counter and instantiate the first passage.
    passage_id = 1
    passage = Passage(id=passage_id)

    # Iterate through all verses in the OT. 
    for verse in F.otype.s('verse')[:end_node]:

        # Check if the string is a paragraph marker and if the paragraph is large enough.  
        if len(passage.verses) > 1:
            valid, add_verse = valid_passage(passage, verse, passage_size)
            if valid:

                # We have reached the end of the passage so we update all of its attribute values.
                if add_verse:
                    passage.verses.append(verse)
                update_passage_data(passage, rank_scale)
                passages.append(passage)

                # Begin a new passage. 
                passage_id += 1
                passage = Passage(id=passage_id)
                # passage.reset_values()
                if not add_verse:
                    passage.verses.append(verse)
            
            else:
                passage.verses.append(verse)

        # If we haven't reached the end of the passage, simply add the present verse.
        else:
            passage.verses.append(verse)

    return passages

def valid_passage(passage, verse, passage_size):
    is_valid = False
    add_verse = True
    # Get the string value at the end of the verse. 
    verse_ending = T.text(verse).split()[-1]
    verse_book = L.u(verse, otype='book')
    ps_119 = 427315 # node for Psalm 119
    # Check if we've reached a new book, if yes, end the paragraph.
    if L.u(passage.verses[-1], otype='book') != verse_book:
        is_valid = True 
        add_verse = False
    # Check if it's in Psalms
    elif verse_book[0] == T.bookNode('Psalms'):
        # If Psalm 119, split up into 8 verse sections
        if L.u(verse, otype='chapter')[0] == ps_119:
            if (verse-1) % 8 == 0:
                is_valid = True 
                add_verse = False
        # See if a new chapter is reached
        elif L.u(verse, otype='chapter') != L.u(passage.verses[-1], otype='chapter'):
            is_valid = True 
            add_verse = False
    # Check if in the following books
    elif verse_book[0] in [T.bookNode('Ruth'), T.bookNode('Jonah'), T.bookNode('Ecclesiastes')]:
        if L.u(verse, otype='chapter') != L.u(passage.verses[-1], otype='chapter'):
            is_valid = True 
            add_verse = False
    # A normal paragraph
    elif verse_ending in passage.paragraph_markers.keys() \
    and len(passage.get_all_words()) >= passage_size:
        is_valid = True 

    return is_valid, add_verse

def update_passage_data(passage, rank_scale):
    passage.word_ranks_data = {k:{'occ':0, 'words':set()} for k in rank_scale.keys()}
    passage.words = passage.get_all_words()

    passage.start_word = passage.words[0]
    passage.end_word = passage.words[-1]
    passage.word_count = len(passage.words)

    passage.weight1 = passage.get_passage_weight1(rank_scale)
    passage.weight2a = passage.get_passage_weight2(rank_scale)
    passage.weight2b = passage.get_passage_weight2(rank_scale, div_all=False)
    passage.weight3a = passage.get_passage_weight3(rank_scale)
    passage.weight3b = passage.get_passage_weight3(rank_scale, div_all=False)

    # Update the passage's word frequency and verb data.
    for word in passage.words:
        # Update the types and stems of verbs present. 
        # if F.sp.v(word) == 'verb':
        #     if F.vt.v(word) not in c.easy_vtypes:
        #         passage.verb_types_present.add(F.vt.v(word))
        #     if F.vt.v(word) not in c.easy_vstems:
        #         passage.verb_stems_present.add(F.vs.v(word))
        # Update the word_ranks_data dictionary with
        # the words in each category.
        for rank in rank_scale.keys():
            lex_freq = F.freq_lex.v(word)
            range = rank_scale[rank]['range']
            if lex_freq >= range[0] and lex_freq < range[1]:
                passage.word_ranks_data[rank]['occ'] += 1
                passage.word_ranks_data[rank]['words'].add(F.voc_lex_utf8.v(word))
                    
    # Update the passage's start and end reference.
    start_ref = T.sectionFromNode(passage.verses[0])
    end_ref = T.sectionFromNode(passage.verses[-1])
    passage.start_ref = f"{start_ref[0][:6]} {start_ref[1]}:{start_ref[2]}"
    passage.end_ref = f"{end_ref[0][:6]} {end_ref[1]}:{end_ref[2]}"

In [141]:
print(T.nodeFromSection("Ruth 1 1"))
# print(T.nodeFromHeading((('book', 'Ruth'), ('chapter', 1), ('verse', 1))))
print(T.bookNode('Ruth'))

None
426620


In [7]:
# # Get a dictionary of all passages in the OT,
# # in the order of their occurences.
# r = Ranks()
# rank_scale = Classify(r.ranks2, r.ranges2, r.weights2).rank_scale()
# # end_node = len(F.otype.s('verse'))
# all_passages = AllPassages(rank_scale=rank_scale)
# all_passages.order_sorted = get_passages(rank_scale)
# all_passages.weight_sorted = all_passages.weight_sort()

In [7]:
# A helper function to format printing output. 
# Takes a set of words as input and returns a string.
def format_output(output):
    # Sort the words by alphabetical order.
    output = sorted(list(output))
    formatted = ''
    # Add spacing between the words until the
    # last word is reached. 
    for item in output:
        if item != output[-1]:
            formatted += item + '  '
        else:
            formatted += item
    # Return a string of the formatted words. 
    return formatted

In [38]:
# # Output data to CSV
# import csv

# def to_csv(all_passages, _file):
#     rank_scale = all_passages.rank_scale
#     lim = -3
#     rr = list(rank_scale.keys())[lim:] # rarest ranks
#     with open(_file, mode='w', encoding='utf-8') as file:
#         writer = csv.writer(file, delimiter=',')
#         writer.writerow([
#             all_passages.print_scale()
#         ])
#         writer.writerow([
#             'Reference', 
#             'Weight', 
#             'Words',
#             # 'Verb Types', 
#             # 'Verb Stems', 
#             f"{rr[0]} ({rank_scale[rr[0]]['range'][0]}-{rank_scale[rr[0]]['range'][1]}):",
#             f"{rr[1]} ({rank_scale[rr[1]]['range'][0]}-{rank_scale[rr[1]]['range'][1]}):",
#             f"{rr[2]} ({rank_scale[rr[2]]['range'][0]}-{rank_scale[rr[2]]['range'][1]}):",
#         ])
#         for p in all_passages.weight_sorted1:
#             writer.writerow([
#                 f"{p.start_ref} - {p.end_ref}", 
#                 p.weight1, 
#                 p.word_count,
#                 # format_output(p.verb_types_present), 
#                 # format_output(p.verb_stems_present),
#                 f"{p.word_ranks_data[rr[0]]['occ']}  {format_output(p.word_ranks_data[rr[0]]['words'])}",
#                 f"{p.word_ranks_data[rr[1]]['occ']}  {format_output(p.word_ranks_data[rr[1]]['words'])}",
#                 f"{p.word_ranks_data[rr[2]]['occ']}  {format_output(p.word_ranks_data[rr[2]]['words'])}"
#             ])

In [8]:
# Output data to CSV
import csv

def to_csv(all_passages, _file):
    rank_scale = all_passages.rank_scale
    lim = -5
    rr = list(rank_scale.keys())[lim:] # rarest ranks
    with open(_file, mode='w', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow([
            all_passages.print_scale()
        ])
        headings = [
            'Rank',
            'Reference', 
            'Weight',
            'Words',
        ]
        for i in range(len(rr)):
            headings.append(
                f"{rr[i]} ({rank_scale[rr[i]]['range'][0]}-{rank_scale[rr[i]]['range'][1]}):"
            )        
        writer.writerow(headings)
        for p in all_passages.weight_sorted1:
            row = [
                all_passages.weight_sorted1[p]+1,
                f"{p.start_ref} - {p.end_ref}", 
                p.weight1, 
                p.word_count,
            ]
            for i in range(len(rr)):
                row.append(
                    f"{p.word_ranks_data[rr[i]]['occ']}  {format_output(p.word_ranks_data[rr[i]]['words'])}",
                )
            writer.writerow(row)

In [9]:
from collections import OrderedDict
import xlsxwriter


data = {"1":["xyz",""],"2":["abc","def"],"3":["zzz",""]}

# Use an OrderedDict to maintain the order of the columns
data = OrderedDict((k,data.get(k)) for k in sorted(data.keys()))

# Open an Excel workbook
workbook = xlsxwriter.Workbook('dict_to_excel.xlsx')

# Set up a format
book_format = workbook.add_format(properties={'bold': True, 'font_color': 'red'})

# Create a sheet
worksheet = workbook.add_worksheet('dict_data')

# Write the headers
for col_num, header in enumerate(data.keys()):
    worksheet.write(0,col_num, int(header))

# Save the data from the OrderedDict into the excel sheet
for row_num,row_data in enumerate(zip(*data.values())):
    for col_num, cell_data in enumerate(row_data):
        if cell_data ==  "xyz":
            worksheet.write(row_num+1, col_num, cell_data, book_format)
        else:
            worksheet.write(row_num+1, col_num, cell_data)

# Close the workbook
workbook.close()

In [13]:
import time

files = [
    '../output/rank1.csv',
    '../output/rank2.csv',
    '../output/rank3.csv',
    '../output/rank4.csv',
    '../output/rank5.csv',
    '../output/rank6.csv'
]

def all_ranks():
    start = time.time()
    r = Ranks()
    all_passage_rankings = []
    # for i in range(len(r.all_ranks)):
    for i in [5]:
        rank_scale = Classify(r.all_ranks[i], r.all_ranges[i], r.all_weights[i]).rank_scale()
        all_p = AllPassages(rank_scale=rank_scale)
        all_p.order_sorted = get_passages(rank_scale, end_node=len(F.otype.s('verse')), passage_size=100)
        all_p.word_count_sorted = all_p.word_count_sort()
        all_p.weight_sorted1 = all_p.weight_sort1()
        all_p.weight_sorted2a = all_p.weight_sort2a()
        all_p.weight_sorted2b = all_p.weight_sort2b()
        all_p.weight_sorted3a = all_p.weight_sort3a()
        all_p.weight_sorted3b = all_p.weight_sort3b()
        all_passage_rankings.append(all_p)
        # file = files[i]
        # to_csv(all_p, file)
        end = time.time()
        print(i+1, "complete", end-start)
    return all_passage_rankings

In [26]:
print(len(1))

TypeError: object of type 'int' has no len()

In [23]:
rankings = all_ranks()
# to_csv(rankings[0], files[0])

6 complete 15.315574169158936


In [15]:
import pandas as pd
from IPython.display import display, HTML

def create_df(passages, display_order):
    rank_scale = passages.rank_scale
    lim = -3
    rr = list(rank_scale.keys())[lim:] # rarest ranks
    df_cols = [
        'Reference',
        'Words',
        'Weight', 
        'R2a',
        'R2b',
        'R3a',
        'R3b',
        f"{rr[0]} ({rank_scale[rr[0]]['range'][0]}-{rank_scale[rr[0]]['range'][1]}):",
        f"{rr[1]} ({rank_scale[rr[1]]['range'][0]}-{rank_scale[rr[1]]['range'][1]}):",
        f"{rr[2]} ({rank_scale[rr[2]]['range'][0]}-{rank_scale[rr[2]]['range'][1]}):",
    ]
    row_list = []
    for p in display_order:
        row_dict = {}
        row = [
            f"{p.start_ref} - {p.end_ref}", 
            p.word_count, 
            p.weight1,
            # Go into the dict to get the ranking for that weight
            passages.weight_sorted2a[p]+1,
            passages.weight_sorted2b[p]+1,
            passages.weight_sorted3a[p]+1,
            passages.weight_sorted3b[p]+1,
            f"{p.word_ranks_data[rr[0]]['occ']}  {format_output(p.word_ranks_data[rr[0]]['words'])}",
            f"{p.word_ranks_data[rr[1]]['occ']}  {format_output(p.word_ranks_data[rr[1]]['words'])}",
            f"{p.word_ranks_data[rr[2]]['occ']}  {format_output(p.word_ranks_data[rr[2]]['words'])}"
        ]
        for i in range(len(row)):
            row_dict[df_cols[i]] = row[i]
        row_list.append(row_dict)
        # print(row_dict)
    # print(row_list)
    df = pd.DataFrame(row_list, columns=df_cols)
    return df

In [24]:
to_csv(rankings[0], '../output/rank6.csv')

In [25]:
passages = rankings[0]
display_order = passages.word_count_sorted
df = create_df(passages, display_order)
display(HTML(
    df.head(20).to_html(index=True))
)

Unnamed: 0,Reference,Words,Weight,R2a,R2b,R3a,R3b,Scarce (20-30):,Scarcer (10-20):,Scarcest (1-10):
0,Psalms 117:1 - Psalms 117:2,20,2.2175,1570,1205,1481,589,1 גבר,0,2 אֻמָּה שׁבח
1,Psalms 134:1 - Psalms 134:3,33,0.8955,427,5,33,1,0,0,0
2,Lament 5:19 - Lament 5:22,35,1.68,1264,337,1099,33,0,1 חדשׁ,0
3,Hosea 14:9 - Hosea 14:10,38,1.8382,1417,738,1266,125,1 בְּרֹושׁ,3 עָצָב רַעֲנָן שׁור,0
4,Psalms 131:1 - Psalms 131:3,46,1.7543,1243,609,1170,232,0,1 שׁוה,0
5,Psalms 119:137 - Psalms 119:144,51,2.5255,1624,1217,1563,620,2 פִּקּוּדִים צָעִיר,1 צמת,2 מָצֹוק שַׁעֲשׁוּעִים
6,Psalms 150:1 - Psalms 150:6,51,3.352,1737,1750,1745,1727,2 נְשָׁמָה נֵבֶל,3 גֹּדֶל רָקִיעַ תֹּף,7 מֵן מָחֹול עוּגָב צֶלְצֶלִים שֶׁמַע תֵּקַע
7,Malach 3:22 - Malach 3:24,52,1.0337,747,17,173,3,1 חֵרֶם,1 חֹרֵב,1 אֵלִיָּה
8,Psalms 93:1 - Psalms 93:5,52,2.6327,1631,1318,1646,957,2 אַדִּיר,1 אזר,4 גֵּאוּת דֳּכִי מִשְׁבָּר נאה
9,Psalms 119:25 - Psalms 119:32,52,2.2615,1562,1036,1484,445,3 פִּקּוּדִים רחב שׂיח,0,3 דלף שׁוה תּוּגָה


In [151]:
df.to_csv('../output/test.csv', sep='\t', encoding='utf-8')

In [None]:
# TODO
# Ruth
# Proverbs 10-13 & 14-19
# Eccl 3-9 & 9-11
# Gen 28:10 - 32:10

### Weight metrics comparison
#### 5-weight ranking:
```
scale = {
    'frequent': list(range(500, 50000)),    1
    'common':   list(range(250, 500)),      2
    'medium':   list(range(150, 250)),      3
    'uncommon': list(range(50, 150)),       5
    'rare':     list(range(1, 50))          8
}
```
#### 7-weight ranking
```
rank_scale = {
    'abundant': list(range(1000, 50000)),   1
    'frequent': list(range(500, 1000)),     1.5
    'common':   list(range(300, 500)),      2.25
    'average':  list(range(175, 300)),      3.5
    'uncommon': list(range(75, 175)),       5
    'rare':     list(range(35, 75)),        7.5
    'scarce':   list(range(1, 35))          11
}
```

#### 14 easiest passages
|    | 5-rank          | weight | U, R  | 7-rank          | weight | U, R, S  |
|----|-----------------|--------|-------|-----------------|--------|----------|
| 1  | Jer 26:7-15     | 1.3418 | 11, 5 | Jer 26:7-15     | 2.0767 | 9, 7, 1  |
| 2  | Ezek 37:11-28   | 1.3449 | 14, 7 | 2 Sam 7:18-29   | 2.1226 | 6, 4, 5  |
| 3  | 2 Sam 7:18-29   | 1.386  | 10, 6 | Deut 18:14-22   | 2.1545 | 7, 0, 7  |
| 4  | 2 Kings 8:1-6   | 1.4181 | 8, 3  | 1 Chr 17:3-15   | 2.1731 | 5, 7, 3  |
| 5  | 1 Chr 17:3-15   | 1.4187 | 6, 8  | Ezek 37:11-28   | 2.2009 | 16, 3, 5 |
| 6  | Ex 33:12-23     | 1.4191 | 4, 7  | Jer 35:12-18    | 2.2769 | 6, 3, 3  |
| 7  | Josh 21:43-22:6 | 1.4211 | 11, 2 | 1 Chr 17:16-27  | 2.2945 | 5, 5, 6  |
| 8  | Deut 18:14-22   | 1.4313 | 4, 7  | Josh 21:43-22:6 | 2.2952 | 8, 2, 2  |
| 9  | Ex 6:2-12       | 1.4399 | 8, 9  | 1 Kings 8:12-21 | 2.3107 | 8, 1, 4  |
| 10 | Jer 7:20-28     | 1.4461 | 7, 8  | Deut 31:1-13    | 2.3617 | 6, 10, 4 |
| 11 | Ex 6:29-7:13    | 1.4465 | 7, 9  | Ex 6:2-12       | 2.3786 | 7, 4, 6  |
| 12 | Jer 16:9-15     | 1.4568 | 6, 6  | Num 8:23-9:8    | 2.422  | 11, 3, 2 |
| 13 | 1 Sam 12:6-17   | 1.4605 | 5, 14 | Jer 44:24-29    | 2.4236 | 6, 5, 6  |
| 14 | Deut 31:1-13    | 1.4703 | 9, 9  | Ex 6:29-7:13    | 2.4237 | 5, 4, 6  |

Note: the counts for U (uncommon), R (rare), and S (scarce) words only include *unique* occurences. This means, for example, that if the scarce word תנין occured 5 times in the passage, it would only be counted once in the list of scarce words. 

#### TODO
- Experiment with different paragraph size metrics.
- Fine-tune the weight system.
    - Currently a passage is weighted by the lexical frequency of each (non-stop) word present. However, since in a paragraph's *word_ranks_data* I only include unique words, an infrequent word that occurs many times in a paragraph could unnecesarily penalize the weight. For example, if a word that only occurs 15 times has 8 of its occurences in the present paragraph, it would add a weight of 88 with the 7-rank system, even though the reader will become familiar with it by the end of the paragraph. 
    - To troubleshoot this, we could either only increase the weight of a passage according to each unique word, or decrease the weight each consecutive time it shows up in the paragraph. 
- Find better data for lexical forms.
    - The BHSA dataset doesn't follow the lexical forms in Strongs closely enough. 