In [1]:
from nltk.corpus import cmudict
import re
from pprint import pprint
import pyphen
# Download the cmudict data if you haven't already
# import nltk
# try:
#     nltk.data.find("corpora/cmudict.zip")
# except nltk.downloader.DownloadError:
#     nltk.download("cmudict")

In [2]:
with open("../data/kjv.txt", "r") as file:
    kjv_text = file.read()

clean_kjv_text = re.sub(r"[.,:;'()?!|]", "", kjv_text)

all_kjv_words = set()

for word in clean_kjv_text.split():
    all_kjv_words.add(word)

all_kjv_words = list(all_kjv_words)

print(f"Total number of unique words: {len(all_kjv_words)}")

Total number of unique words: 13725


In [3]:
pronunciations = cmudict.dict()


def count_syllables_cmudict(word: str, pronunciations_dict: dict) -> int:
    """Counts the syllables in a word using NLTK's CMUDict."""
    word_lower = word.lower()
    if word_lower not in pronunciations_dict:
        return 0
    # A word can have multiple pronunciations; we'll use the first one.
    return len(
        [
            phoneme
            for phoneme in pronunciations_dict[word_lower][0]
            if phoneme[-1].isdigit()
        ]
    )


def process_word(word: str, pronunciations_dict: dict, stats: dict) -> None:
    """
    Checks a word against CMUDict and various transformations, updating stats.
    Returns the word if it remains unrecognized, otherwise returns None.
    """
    if count_syllables_cmudict(word, pronunciations_dict) > 0:
        stats["Immediately recognized"] += 1
        return None

    if not str.islower(word):
        stats["Capital letter, skipped"] += 1
        return None

    # --- Suffix and Transformation Checks ---

    # Rule: Remove 'est' or 'eth'
    if word.endswith(("eth", "est")):
        base_word_3 = word[:-3]
        base_word_2 = word[:-2]
        if (
            count_syllables_cmudict(base_word_3, pronunciations_dict) > 0
            or count_syllables_cmudict(base_word_2, pronunciations_dict) > 0
        ):
            stats["Removed 'est' or 'eth', recognized"] += 1
            return None 

    # Rule: Remove 's' from end
    if word.endswith(("s")):
        transformed_word = word[:-1]
        if count_syllables_cmudict(transformed_word, pronunciations_dict) > 0:
            stats["Removed 's' from end, recognized"] += 1
            return None

    # Rule: Remove 1 extra character from the end (often after suffix removal fails)
    if word.endswith(("eth", "est")):
        base_word_4 = word[:-4]
        if count_syllables_cmudict(base_word_4, pronunciations_dict) > 0:
            stats["Removed 1 extra character from the end, recognized"] += 1
            return None

    # Rule: Replace 'iest' or 'ieth' with 'y'
    if word.endswith(("ieth", "iest")):
        transformed_word = re.sub(r"(iest|ieth)$", "y", word)
        if count_syllables_cmudict(transformed_word, pronunciations_dict) > 0:
            stats["Replaced 'iest' or 'ieth' with 'y', recognized"] += 1
            return None

    # Rule: Replace British 'our' with 'or'
    if "our" in word:
        transformed_word = re.sub(
            r"our", "or", word
        )
        if count_syllables_cmudict(transformed_word, pronunciations_dict) > 0:
            stats["Replaced British 'u', recognized"] += 1
            return None

    # Rule: Replace British 'our' with 'or'
    if "our" in word and word.endswith(("est", "eth")):
        transformed_word = re.sub(r"our\w*", "or", word)
        if count_syllables_cmudict(transformed_word, pronunciations_dict) > 0:
            stats["Replaced British 'u' and removed 'est' or 'eth', recognized"] += 1
            return None

    # Rule: Replace 'or'/'ors' with 'er'
    if word.endswith(("or", "ors")):
        transformed_word = re.sub(r"or\w?", "er", word)
        if count_syllables_cmudict(transformed_word, pronunciations_dict) > 0:
            stats["Replaced 'or'/'ors' with 'er', recognized"] += 1
            return None

    # If no rules matched and the word is unrecognized
    stats["Unrecognized and not caught by filters"] += 1
    return word


# --- Main Execution ---

difficult_words = []
word_stats = {
    "Immediately recognized": 0,
    "Capital letter, skipped": 0,
    "Unrecognized and not caught by filters": 0,
    "Removed 'est' or 'eth', recognized": 0,
    "Replaced 'iest' or 'ieth' with 'y', recognized": 0,
    "Removed 1 extra character from the end, recognized": 0,
    "Replaced British 'u', recognized": 0,
    "Replaced British 'u' and removed 'est' or 'eth', recognized": 0,
    "Replaced 'or'/'ors' with 'er', recognized": 0,
    "Removed 's' from end, recognized": 0,

}

for word in all_kjv_words:
    unrecognized = process_word(word, pronunciations, word_stats)
    if unrecognized:
        difficult_words.append(unrecognized)

print("\n--- Stats ---")
pprint(word_stats)


--- Stats ---
{'Capital letter, skipped': 3098,
 'Immediately recognized': 8280,
 "Removed 'est' or 'eth', recognized": 776,
 "Removed 's' from end, recognized": 192,
 'Removed 1 extra character from the end, recognized': 45,
 "Replaced 'iest' or 'ieth' with 'y', recognized": 20,
 "Replaced 'or'/'ors' with 'er', recognized": 2,
 "Replaced British 'u' and removed 'est' or 'eth', recognized": 5,
 "Replaced British 'u', recognized": 33,
 'Unrecognized and not caught by filters': 1274}


In [4]:
hyphen_gb = pyphen.Pyphen(lang="en_GB")
hyphen_us = pyphen.Pyphen(lang="en_US")

def count_syllables_pyphen(word: str):
    hyphenated_word_gb = hyphen_gb.inserted(word)
    syllable_count_gb = len(hyphenated_word_gb.split("-"))
    hyphenated_word_us = hyphen_us.inserted(word)
    syllable_count_us = len(hyphenated_word_us.split("-"))
    return max(syllable_count_us, syllable_count_gb)

print("word: syllables")

# My observation: in most cases, the best answer is the highest of the 2.
# Therefore, the best option would be to get the higher count.
for word in difficult_words[0:10]:
    word = re.sub("-", "", word)
    print(f"{word}: {count_syllables_pyphen(word)}")

word: syllables
eveningtide: 2
forbeareth: 2
sawn: 1
haply: 2
viol: 2
cloven: 1
appertaineth: 3
stumblingblocks: 3
menstealers: 3
scall: 1


In [5]:
pronunciations = cmudict.dict()
def get_syllable_count(word:str):
    syllable_count = count_syllables_cmudict(word, pronunciations)
    if syllable_count == 0:
        syllable_count = count_syllables_pyphen(word)
    return syllable_count

for word in all_kjv_words[0:10]:
    print(f"{word}: {get_syllable_count(word)}")

calling: 2
Zacchaeus: 4
Whoso: 1
spoiler: 2
eveningtide: 2
Arpad: 1
forbeareth: 2
Aharah: 2
offered: 2
idleness: 3
