In [1]:
from nltk.corpus import cmudict
import re

In [2]:
# Read the whole KJV text. The encoding is passed explicitly so decoding does not
# depend on the platform's locale default.
# NOTE(review): assumes the file is UTF-8 (plain ASCII is a subset) — confirm.
with open("../data/kjv.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Drop punctuation so that tokens differing only by punctuation collapse together.
text = re.sub(r"[.,:;'()?!]", "", text)

# De-duplicate the whitespace-separated tokens. Sorting gives the list a stable order:
# iterating a set of strings varies between interpreter runs (hash randomisation),
# which would make any output derived from this list change on every kernel restart.
all_kjv_words = sorted(set(text.split()))

print(f"Total number of unique words: {len(all_kjv_words)}")

Total number of unique words: 13734


In [3]:
# Load the CMU Pronouncing Dictionary through NLTK. The lookups below treat it as a mapping
# from a lowercased word to a list of pronunciations, each a sequence of phoneme strings.
# NOTE(review): presumably requires the "cmudict" corpus to be downloaded already
# (e.g. via nltk.download) — confirm for fresh environments.
pronunciations = cmudict.dict()

def count_syllables_cmudict(word: str, pronunciations: dict):
    """Return the number of syllables in ``word`` using a CMUDict-style mapping.

    Args:
        word: The word to look up; it is lowercased before the lookup.
        pronunciations: Mapping from lowercase word to a list of pronunciations,
            where each pronunciation is a sequence of phoneme strings.

    Returns:
        The number of phonemes in the first listed pronunciation whose last
        character is a digit (CMUDict marks vowel sounds with a stress digit),
        or 0 when the word is not in the mapping.
    """
    key = word.lower()
    if key not in pronunciations:
        return 0

    # Several pronunciations may be listed; only the first one is considered.
    first_pronunciation = pronunciations[key][0]

    syllables = 0
    for symbol in first_pronunciation:
        if symbol[-1].isdigit():
            syllables += 1
    return syllables


print("""    List of all words in KJV, unrecognized by CMUDict and not caught by this filter.
    These words need to be manually mapped to a syllable count:""")


def _archaic_suffix_candidates(word: str) -> tuple:
    """Return the modern spellings to try for a word ending in "eth" or "est"."""
    return (
        # Drop the whole suffix: "staggereth" -> "stagger".
        word[:-3],
        # Drop only "th"/"st": "removeth" -> "remove".
        word[:-2],
        # Swap the suffix for "y": "repliest" -> "reply", "blasphemest" -> "blasphemy".
        re.sub(r"(iest$|ieth$|est$|eth$)", "y", word),
        # Drop the suffix plus one more character (doubled consonant): "slippeth" -> "slip".
        word[:-4],
    )


# Collected here (as well as printed) so later cells can work with the result.
words_to_be_estimated_or_manually_entered = []

for word in all_kjv_words:
    # Words CMUDict already recognizes need no further handling.
    if count_syllables_cmudict(word, pronunciations) != 0:
        continue

    # If word is not all lowercase, it is probably just a name:
    # e.g. "Ashdothpisgah", "A" is not lowercase, and it is a name.
    if not word.islower():
        continue

    # Only words with the archaic "eth"/"est" suffixes are examined by this filter.
    # NOTE(review): unrecognized lowercase words without these suffixes are skipped
    # silently here — confirm they are handled elsewhere.
    if word[-3:] not in ("eth", "est"):
        continue

    # If any candidate spelling is recognized, the word is treated as a normal word
    # carrying the suffix, e.g. "asketh" -> "ask" + "eth".
    if any(
        count_syllables_cmudict(candidate, pronunciations) != 0
        for candidate in _archaic_suffix_candidates(word)
    ):
        continue

    # No candidate was recognized, e.g. "forbeareth" -> "forbear" is still not recognized.
    words_to_be_estimated_or_manually_entered.append(word)
    print(word)

    List of all words in KJV, unrecognized by CMUDict and not caught by this filter.
    These words need to be manually mapped to a syllable count:
straiteneth
favourest
savourest
hearkeneth
appertaineth
bewaileth
reproveth
smotest
vaunteth
dishonoureth
defileth
dishonourest
honoureth
forbeareth
disannulleth
prophesieth
contemneth
swarest
wotteth
bewrayeth
stablisheth
intermeddleth
