In [1]:
from helpers import get_all_kjv_words, get_syllable_count, get_clean_text
from polyleven import levenshtein
import re
from time import time
from rapidfuzz import fuzz, process

In [2]:
# Load every word of the KJV, then build a parallel list that holds one
# syllable count per word (same order as all_kjv_words).
all_kjv_words = get_all_kjv_words("../data/kjv.txt")
all_kjv_syllables = [get_syllable_count(kjv_word) for kjv_word in all_kjv_words]

print(f"Number of words in KJV: {len(all_kjv_words)}")

# Sanity check: show the first ten words next to their syllable counts.
for i in range(10):
    print(f"{all_kjv_words[i]}: {all_kjv_syllables[i]}")

Number of words in KJV: 789627
In: 1
the: 1
beginning: 3
God: 1
created: 3
the: 1
heaven: 2
and: 1
the: 1
earth: 1


In [3]:
# Psalm 67 has a tune. Let's see if it matches the rest of the verses in Psalm 67.
# The "Selah" is not included in the tune.
# NOTE(review): the passage below still contains "Selah" at the end of verse 1,
# so it becomes part of the search pattern - confirm whether it should be
# removed before the syllable pattern is built.
search_text = """
67 1 God be merciful unto us, and bless us; and cause his face to shine upon us; Selah.

2 That thy way may be known upon earth, thy saving health among all nations.

3 Let the people praise thee, O God; let all the people praise thee.

4 O let the nations be glad and sing for joy:
"""

# Normalise the passage with the project helper. Judging by the printed result,
# the chapter/verse numbers and punctuation are removed and only the words remain.
clean_search_text = get_clean_text(search_text)
print(clean_search_text)


  God be merciful unto us and bless us and cause his face to shine upon us Selah

 That thy way may be known upon earth thy saving health among all nations

 Let the people praise thee O God let all the people praise thee

 O let the nations be glad and sing for joy



In [4]:
# Break the cleaned passage into words (whitespace-separated) and compute the
# syllable count of each word, keeping the two lists aligned by index.
search_words = clean_search_text.split()
search_syllables = list(map(get_syllable_count, search_words))

print(f"Psalm 67 - {len(search_words)} words")

# Preview the first ten word/count pairs.
for i in range(10):
    print(f"{search_words[i]}: {search_syllables[i]}")

Psalm 67 - 54 words
God: 1
be: 1
merciful: 3
unto: 2
us: 1
and: 1
bless: 1
us: 1
and: 1
cause: 1


In [5]:
# The exact match algorithms:

# Tests of storing the syllables as a string, e.g. "1132111111" (str), vs. a list, e.g. [1,1,3,2,1,1,1,1,1,1] (list[int]),
# showed little or no difference in search speed at this scale, but converting the list to a str is itself expensive.

def get_exact_match(main_syllables: str | list[int], search_syllables: str | list[int]):
    """Gets the locations of all exact matches by iteratively
    comparing each element of search_syllables to the elements
    in main_syllables, skipping the rest of the check if a mismatch is found.

    Accepts input as:
    - "1132111111" (str) 
    - [1,1,3,2,1,1,1,1,1,1] (list[int])"""
    match_locations = []
    start_time = time()
    for i in range(0, len(main_syllables)):
        match_flag = True
        for j in range(0, len(search_syllables)):
            # Not a match, break out of the loop and skip checking the rest for this position.
            if main_syllables[i + j] != search_syllables[j]:
                match_flag = False
                break
        if match_flag:
            match_locations.append(i)
    end_time = time()
    print(f"get_exact_match: {end_time - start_time:.2f} seconds")
    print(f"Number of exact matches: {len(match_locations)}")
    print(f"Locations of exact matches: {match_locations}")


def get_exact_match_optimized(
    main_syllables: str | list[int], search_syllables: str | list[int]
):
    """Gets the locations of all exact matches by iteratively
    comparing each element of search_syllables to the elements
    in main_syllables, skipping the rest of the check if a mismatch is found.

    Optimization: Finds the max value of search_syllables, then searches for that max value,
    and facilitates a faster search, because the max value is rarer than 1s or 2s

    In testing, this optimization resulted in 100% speedup vs non-optimized version: 0.16s -> 0.08s

    Accepts input as:
    - "1132111111" (str)
    - [1,1,3,2,1,1,1,1,1,1] (list[int])"""
    match_locations = []
    start_time = time()
    peak = max(search_syllables)
    peak_index = search_syllables.index(peak)
    for i in range(0, len(main_syllables)):
        if i + peak_index > len(main_syllables) - 1:
            break
        elif main_syllables[i+peak_index] != peak:
            continue
        match_flag = True
        for j in range(0, len(search_syllables)):
            # Not a match, break out of the loop and skip checking the rest for this position.
            if main_syllables[i + j] != search_syllables[j]:
                match_flag = False
                break
        if match_flag:
            match_locations.append(i)
    end_time = time()
    print(f"get_exact_match: {end_time - start_time:.2f} seconds")
    print(f"Number of exact matches: {len(match_locations)}")
    print(f"Locations of exact matches: {match_locations}")


def list_to_str(the_list: list[int]) -> str:
    """Concatenate the string form of every item into one string.

    Example: [1, 1, 3, 2] -> "1132".

    Each item contributes ``str(item)``, so an item with more than one digit
    occupies more than one character. In that case character positions in the
    result no longer correspond one-to-one to list indices.

    Prints the elapsed conversion time and returns the resulting string
    (empty string for an empty list).
    """
    start_time = time()

    # A single join builds the result in one pass instead of growing a string
    # with repeated += inside a loop.
    the_str = "".join(map(str, the_list))

    end_time = time()

    print(f"list_to_str: {end_time - start_time:.2f} seconds")

    return the_str


# --- Run 1: corpus and pattern as list[int], plain scan ---
print(f"List index matching: e.g. {search_syllables[0:10]}")

get_exact_match(all_kjv_syllables, search_syllables)

print()

# --- Run 2: corpus and pattern as list[int], max-value pre-filter ---
print(f"List index matching (optimized): e.g. {search_syllables[0:10]}")

get_exact_match_optimized(all_kjv_syllables, search_syllables)

print()


# Convert both sequences to strings. list_to_str prints its own timing,
# so the cost of the conversion shows up in the output as well.
all_kjv_syllables_str = list_to_str(all_kjv_syllables)
search_syllables_str = list_to_str(search_syllables)

print()

# --- Run 3: corpus and pattern as str, plain scan ---
print(f"String/substring matching: e.g. '{search_syllables_str[0:10]}'")

get_exact_match(all_kjv_syllables_str, search_syllables_str)

print()

# --- Run 4: corpus and pattern as str, max-value pre-filter ---
print(f"String/substring matching (optimized): e.g. '{search_syllables_str[0:10]}'")

get_exact_match_optimized(all_kjv_syllables_str, search_syllables_str)

List index matching: e.g. [1, 1, 3, 2, 1, 1, 1, 1, 1, 1]
get_exact_match: 0.17 seconds
Number of exact matches: 1
Locations of exact matches: [396659]

List index matching (optimized): e.g. [1, 1, 3, 2, 1, 1, 1, 1, 1, 1]
get_exact_match: 0.08 seconds
Number of exact matches: 1
Locations of exact matches: [396659]

list_to_str: 0.09 seconds
list_to_str: 0.00 seconds

String/substring matching: e.g. '1132111111'
get_exact_match: 0.16 seconds
Number of exact matches: 1
Locations of exact matches: [396659]

String/substring matching (optimized): e.g. '1132111111'
get_exact_match: 0.08 seconds
Number of exact matches: 1
Locations of exact matches: [396659]


In [None]:
# Fuzzy search with rapidfuzz: look for the stretch of the KJV syllable string
# that aligns best with the Psalm 67 syllable string. Each word contributes the
# digits of its syllable count to these strings (see list_to_str).
# For queries under 64 words, this returns the guaranteed closest match
# NOTE(review): the 64-word guarantee is a property of rapidfuzz, not of this
# notebook - confirm it against the rapidfuzz documentation for the installed version.
fuzz.partial_ratio_alignment(
    all_kjv_syllables_str, # KJV Bible (whole corpus; reported as "src" in the result)
    search_syllables_str # Psalm 67 (search pattern; reported as "dest" in the result)
)

ScoreAlignment(score=100.0, src_start=396659, src_end=396713, dest_start=0, dest_end=54)