In [20]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import cmudict
from collections import defaultdict

# Download the required NLTK resources
# nltk.download('wordnet')
# nltk.download('cmudict')


In [22]:
def is_trivial_variation(word1, word2):
    # Filter out possessive forms and common British vs. American spellings 
    # Not complete
    trivial_pairs = {('theater', 'theatre'), ('center', 'centre'), ('color', 'colour'),
                     ('flavor', 'flavour'), ('honor', 'honour'), ('rumor', 'rumour')}
    if word1.endswith("'s") or word2.endswith("'s") or word1.endswith("s'") or word2.endswith("s'"):
        return True
    if (word1, word2) in trivial_pairs or (word2, word1) in trivial_pairs:
        return True
    return False

def get_related_words(word):
    # Include synonyms, hypernyms, hyponyms, and derivationally related forms
    # Antonyms pending
    related_words = set()
    for synset in wn.synsets(word):
        related_words.update(lemma.name().replace('_', ' ') for lemma in synset.lemmas())
        related_words.update(lemma.name().replace('_', ' ') for hyper in synset.hypernyms() for lemma in hyper.lemmas())
        related_words.update(lemma.name().replace('_', ' ') for hypo in synset.hyponyms() for lemma in hypo.lemmas())
        related_words.update(lemma.name().replace('_', ' ') for lemma in synset.lemmas() for form in lemma.derivationally_related_forms())
    return related_words

def are_phonetically_similar(word1, word2, cmu_dict):
    # Check for phonetic similarity using CMU Pronouncing Dictionary
    pron1 = {tuple(phonemes) for phonemes in cmu_dict.get(word1, [])}
    pron2 = {tuple(phonemes) for phonemes in cmu_dict.get(word2, [])}
    return bool(pron1 & pron2)


In [33]:
class Node:
    """
    Represents a single node within the Anatree.
    
    Attributes:
        children (dict): A dictionary mapping characters to Node objects.
        word_end (bool): A flag indicating whether a word ends at this node.
        words (list): A list of complete words that terminate at this node.
    """
    def __init__(self):
        self.children = {}  # Initialize the children as an empty dictionary.
        self.word_end = False  # Initially, this node is not the end of any word.
        self.words = []  # Initialize an empty list to store words ending at this node.

class Anatree:
    """
    Represents the anagram tree (trie structure) used to store words in a way that
    facilitates the search for anagrams.
    
    The tree is built such that each path from the root to a word-end node represents
    a sorted sequence of characters from one or more words.

    Attributes:
        root (Node): The root node of the tree.
    """
    def __init__(self):
        self.root = Node()  # Start with a single root node.

    def insert(self, word):
        """
        Inserts a word into the Anatree. The characters of the word are sorted
        and used to navigate or expand the tree.

        Args:
            word (str): The word to be inserted into the tree.
        """
        node = self.root  # Start at the root of the tree.
        for char in sorted(word):  # Sort the word to treat anagrams the same.
            if char not in node.children:  # If char is not a child, add a new node.
                node.children[char] = Node()
            node = node.children[char]  # Move to the corresponding child node.
        node.word_end = True  # Mark that a word ends at this node.
        node.words.append(word)  # Add the original word to the node.

    def search_anagrams(self, word):
        """
        Searches for anagrams of a given word in the Anatree.

        Args:
            word (str): The word whose anagrams are to be found.

        Returns:
            list: A list of anagrams of the given word, if any exist.
        """
        node = self.root  # Start at the root of the tree.
        for char in sorted(word):  # Use sorted characters to search.
            if char in node.children:
                node = node.children[char]  # Move to the next node in the path.
            else:
                return []  # If any character is missing, no anagrams exist.
        return node.words if node.word_end else []  # Return words if it's a word-end node.


In [38]:
def process_anagrams(anatree, cmu_dict):
    """Process all words in the CMU dictionary to find all, semantic, and phonetic anagrams."""
    all_anagrams = defaultdict(list)
    semantic_anagrams = defaultdict(list)
    phonetic_anagrams = defaultdict(list)

    # Iterate over each word in the CMU dictionary
    for word in cmu_dict.keys():
        word_lower = word.lower()
        anagrams = anatree.search_anagrams(word_lower)

        # Collect all anagrams for each word, excluding the word itself
        filtered_anagrams = [anagram for anagram in anagrams if anagram != word_lower]
        if len(filtered_anagrams) > 0:  # Store only if there are other anagrams
            all_anagrams[word_lower] = filtered_anagrams

        related_words = get_related_words(word_lower)

        # Evaluate semantic and phonetic relationships among the anagrams
        for anagram in anagrams:
            if anagram != word_lower and not is_trivial_variation(word_lower, anagram):
                if anagram in related_words:
                    semantic_anagrams[word_lower].append(anagram)
                if are_phonetically_similar(word_lower, anagram, cmu_dict):
                    phonetic_anagrams[word_lower].append(anagram)

    return all_anagrams, semantic_anagrams, phonetic_anagrams

# Load CMU Pronouncing Dictionary
cmu_dict = cmudict.dict()

# Create an Anatree and insert words
anatree = Anatree()
for word in cmu_dict.keys():
    anatree.insert(word.lower())

# Using the function to find anagrams
all_anagrams, semantic_similar_anagrams, phonetic_similar_anagrams = process_anagrams(anatree, cmu_dict)


In [44]:
# Print some anagrams to verify
print("All Anagrams:")
for key, value in list(all_anagrams.items())[:10]:
    print(f"{key}: {value}")

print("\nSemantic Similar Anagrams:")
for key, value in list(semantic_similar_anagrams.items())[:10]:
    print(f"{key}: {value}")

print("\nPhonetic Similar Anagrams:")
for key, value in list(phonetic_similar_anagrams.items())[:-10]:
    print(f"{key}: {value}")

All Anagrams:
aachen: ['eachan']
aamodt: ['damato']
aancor: ['carano']
aardema: ['amerada']
aaron: ['anora']
aarti: ['atari', 'atria', 'taira', 'tiara']
aase: ['asea']
aasen: ['asean', 'enasa']
aback: ['baack']
abad: ['bada']

Semantic Similar Anagrams:
accouterment: ['accoutrement']
accoutrement: ['accouterment']
amphitheater: ['amphitheatre']
amphitheatre: ['amphitheater']
angered: ['enraged']
ate: ['eat']
bate: ['beat']
beat: ['bate']
beats: ['baste']
being: ['begin']

Phonetic Similar Anagrams:
abel: ['able']
abels: ['ables']
able: ['abel']
ables: ['abels']
accouterment: ['accoutrement']
accouterments: ['accoutrements']
accoutrement: ['accouterment']
accoutrements: ['accouterments']
ack: ['akc']
aires: ['aries']
akc: ['ack']
aker: ['akre']
akre: ['aker']
aleen: ['alene']
alene: ['aleen']
appel: ['apple']
appelbaum: ['applebaum']
appelman: ['appleman']
apple: ['appel']
applebaum: ['appelbaum']
appleman: ['appelman']
aries: ['aires']
axel: ['axle']
axle: ['axel']
baek: ['beak']
baer:

In [43]:
len(cmu_dict)

123455

In [40]:
len(all_anagrams)

32706

In [45]:
semantic_similar_anagrams

defaultdict(list,
            {'accouterment': ['accoutrement'],
             'accoutrement': ['accouterment'],
             'amphitheater': ['amphitheatre'],
             'amphitheatre': ['amphitheater'],
             'angered': ['enraged'],
             'ate': ['eat'],
             'bate': ['beat'],
             'beat': ['bate'],
             'beats': ['baste'],
             'being': ['begin'],
             'casinos': ['cassino'],
             'cats': ['cast'],
             'cleaners': ['cleanser'],
             'cleared': ['declare'],
             'detail': ['dilate'],
             'dilate': ['detail'],
             'elections': ['selection'],
             'elects': ['select'],
             'enraged': ['angered'],
             'fended': ['defend'],
             'fiber': ['fibre'],
             'fiberboard': ['fibreboard'],
             'fibre': ['fiber'],
             'fibreboard': ['fiberboard'],
             'formed': ['deform'],
             'hotels': ['hostel'],
             'lo

In [42]:
len(phonetic_similar_anagrams)

968