In [1]:
from rapidfuzz import fuzz, process
from helpers import get_all_kjv_syllables_str, text_to_syllables_str, get_all_kjv_words, get_clean_text

In [2]:
# Load the full KJV text encoded as one long digit string (e.g. '1132111111...').
# NOTE(review): presumably one digit per word giving that word's syllable count,
# so a string index doubles as a word index -- confirm in helpers.
all_kjv_syllables_str = get_all_kjv_syllables_str()

In [3]:
# Query text: the refrain of the hymn at
# https://hymnary.org/text/i_know_not_why_gods_wondrous_grace_to_me
search_text = """
    But “I know whom I have believed,
    and am persuaded that he is able
    to keep that which I’ve committed
    unto him against that day.”
    """
# Encode the query the same way as the KJV text so the two digit strings can be
# compared directly by the fuzzy search.
search_syllables_str = text_to_syllables_str(search_text)
print(search_syllables_str)

1111112113111211111321211


In [4]:
def get_fuzzy_matches(
    search_syllables_str: str,
    kjv_syllables_str: str,
    score_cutoff: float,
    step: int = 1,
):
    """
    Slide a window the length of search_syllables_str across kjv_syllables_str,
    score every window against the query with fuzz.ratio, and return the windows
    whose score is at or above score_cutoff.

    - Input:
        - search_syllables_str: The short string to be a query
            - e.g. '1111112113111211111321211'
        - kjv_syllables_str: The long string to be searched
            - e.g. '1132111111' + (...) + '111112111112'
        - score_cutoff: The minimum score that will be returned (inclusive)
            - e.g. 95.0
        - step: Distance between the start offsets of consecutive windows.
            Defaults to 1 (every offset is tested). Larger values trade recall
            for speed.
            - e.g. 1
    - Output
        - list[tuple[
            - matching_section: The string matching the query string
                - e.g. '1111112113111211111132121'
            - score: The score of this match
                - e.g. 96.0
            - index: The offset in kjv_syllables_str at which the matching
              window starts, regardless of step
                - e.g. 757980
            ]]
    - Raises
        - ValueError: if step is smaller than 1
    """
    if step < 1:
        # Fail fast: step == 0 would otherwise only blow up lazily inside the
        # generator, and a negative step would silently produce no windows.
        raise ValueError(f"step must be a positive integer, got {step!r}")

    window_length = len(search_syllables_str)
    last_start = len(kjv_syllables_str) - window_length

    # The generator keeps memory flat: windows are produced one at a time
    # instead of materialising every slice of the long string up front.
    matches = process.extract(
        query=search_syllables_str,
        choices=(
            kjv_syllables_str[start : start + window_length]
            for start in range(0, last_start + 1, step)
        ),
        scorer=fuzz.ratio,
        limit=None,
        score_cutoff=score_cutoff,
    )

    if step == 1:
        # Position in the choices iterable already equals the string offset.
        return matches

    # process.extract reports the position of each choice within the iterable;
    # multiply by step to turn that back into an offset in kjv_syllables_str.
    return [
        (section, score, position * step) for section, score, position in matches
    ]

# Run the search at a cutoff of 95 and show at most the first ten hits.
fuzzy_matches = get_fuzzy_matches(search_syllables_str, all_kjv_syllables_str, 95)
print(fuzzy_matches[:10])

[('1111112111311111111321211', 96.0, 93112), ('1111111211311121111131211', 96.0, 138748), ('1111112113111211111312111', 96.0, 138749), ('1111111211311121111121211', 96.0, 222955), ('1111112113111211111212111', 96.0, 222956), ('1111112111112111113121211', 96.0, 358398), ('1111112211311121111121211', 96.0, 724330), ('1111112113111211111132121', 96.0, 757980), ('1111121131112111111321211', 96.0, 757981)]


In [5]:
all_kjv_words = get_all_kjv_words()

# Look up the words behind one of the reported hits: the index returned by the
# fuzzy search is used directly as an index into the KJV word list.
verse_start = 757980
verse_length = 25
verse_end = verse_start + verse_length
print(all_kjv_words[verse_start:verse_end])

['for', 'I', 'know', 'whom', 'I', 'have', 'believed', 'and', 'am', 'persuaded', 'that', 'he', 'is', 'able', 'to', 'keep', 'that', 'which', 'I', 'have', 'committed', 'unto', 'him', 'against', 'that']


In [6]:
# Print the KJV words at one reported hit side by side with the search text,
# word by word, to check whether the syllable-pattern match is also a textual match.
clean_search_text = get_clean_text(search_text)
search_words = clean_search_text.split()  # split once, not on every iteration

candidate_start = 93112  # index of the first hit printed by the fuzzy search
kjv_window = all_kjv_words[candidate_start : candidate_start + len(search_words)]

# zip stops at the shorter sequence, so a search text with fewer words than a
# hard-coded count can no longer raise IndexError.
for kjv_word, search_word in zip(kjv_window, search_words):
    print(f"{kjv_word} / {search_word}")

that / But
he / I
was / know
sold / whom
to / I
him / have
unto / believed
the / and
year / am
of / persuaded
jubilee / that
and / he
the / is
price / able
of / to
his / keep
sale / that
shall / which
be / Ive
according / committed
unto / unto
the / him
number / against
of / that
years / day
