In [1]:
# functionality import
import re
import pandas as pd

# data import
def _open(path):
    with open(path, 'r') as f:
        return tuple(f.read().split('\n'))

SYMBOLS = _open('./data/cmu-pronouncing-dictionary/cmudict.symbols')
ALL_PRONOUNCED_WORDS = _open('./data/cmu-pronouncing-dictionary/cmudict.dict')
phones = pd.read_csv('./data/cmu-pronouncing-dictionary/cmudict.phones', sep='\t', header=None)
# not pictured: pronunciation of punctuation in cmudict.vp
print(f'number of pronunciations: {len(ALL_PRONOUNCED_WORDS)}')

number of pronunciations: 135011


In [4]:
pd.np.log10(135011 ** 2)

10.260738308022356

In [2]:
# record all the identicalled spelled and identically pronounced words
alt_identifier = re.compile(r'\(\d+\)$')

spelling_to_pronunciation = {}
pronunciation_to_spelling = {}

for entry in ALL_PRONOUNCED_WORDS:
    split = entry.split(' ')
    if len(split) >= 2:
        word, *the_rest = split
        phonetic_tuple = tuple(the_rest)
        if alt_identifier.search(word):
            # this word spelling is an alternate pronunciation
            word = re.sub(alt_identifier, '', word)
            spelling_to_pronunciation[word].append(phonetic_tuple)
        else:
            spelling_to_pronunciation[word] = [phonetic_tuple]
        if phonetic_tuple in pronunciation_to_spelling:
            pronunciation_to_spelling[phonetic_tuple].append(word)
        else:
            pronunciation_to_spelling[phonetic_tuple] = [word]

In [4]:
homographic = {
    word:pronunciations
    for word, pronunciations in spelling_to_pronunciation.items()
    if len(pronunciations) > 1
}
homophonic = {
    pronunciation:words
    for pronunciation, words in pronunciation_to_spelling.items()
    if len(words) > 1
}

In [5]:
# quick interlude: counts of items in each set
for c in ('homographic', 'homophonic'):
    print(f'{c}:\t{len(eval(c))}')

homographic:	8417
homophonic:	13091


In [130]:
homophonic # most from plural/posessive confusion

{('B', 'AW1', 'T'): ["'bout", 'bout'],
 ('K', 'AO1', 'R', 'S'): ["'course", 'coarse', 'corse', 'course'],
 ('K', 'Y', 'UW1', 'Z'): ["'cuse",
  'cues',
  'kuse',
  'kuze',
  "q's",
  "q.'s",
  'q.s',
  'ques',
  'queues'],
 ('AH0', 'M'): ["'em", "'m", "i'm"],
 ('K', 'EY1'): ["'kay",
  'cay',
  'k',
  'k.',
  'kay',
  'kaye',
  'khe',
  'quai',
  'quay',
  'quaye'],
 ('AH0', 'N'): ["'n", 'an'],
 ('R', 'AW1', 'N', 'D'): ["'round", 'round'],
 ('EH1', 'S'): ["'s", 'es', 'ess', 'esse', 's', 's.'],
 ('T', 'IH1', 'L'): ["'til", 'til', 'till'],
 ('T', 'IH1', 'Z'): ["'tis", 'tis'],
 ('T', 'W', 'AH1', 'Z'): ["'twas", 'twas'],
 ('EY1',): ['a', 'a.', 'ae', 'ay'],
 ('EY1', 'Z'): ["a's", "a.'s", 'a.s'],
 ('AA1', 'K', 'ER0'): ['aaker', 'ocker'],
 ('AA1', 'N', 'K', 'AO2', 'R'): ['aancor', 'encore', 'oncor'],
 ('AA1', 'R', 'G'): ['aargh', 'argh'],
 ('EH1', 'R', 'AH0', 'N'): ['aaron', 'aran', 'aron', 'ehren'],
 ('EH1', 'R', 'AH0', 'N', 'Z'): ["aaron's", 'aarons', 'arens'],
 ('EH1', 'R', 'AH0', 'N', 'S', 

In [6]:
not_plural_or_posessive_homophones = {
    k:v for k,v in homophonic.items()
    if not  all(re.search(r"'?s'?$", m) for m in v)
}
not_plural_or_posessive_homophones

{('B', 'AW1', 'T'): ["'bout", 'bout'],
 ('K', 'AO1', 'R', 'S'): ["'course", 'coarse', 'corse', 'course'],
 ('K', 'Y', 'UW1', 'Z'): ["'cuse",
  'cues',
  'kuse',
  'kuze',
  "q's",
  "q.'s",
  'q.s',
  'ques',
  'queues'],
 ('AH0', 'M'): ["'em", "'m", "i'm"],
 ('K', 'EY1'): ["'kay",
  'cay',
  'k',
  'k.',
  'kay',
  'kaye',
  'khe',
  'quai',
  'quay',
  'quaye'],
 ('AH0', 'N'): ["'n", 'an'],
 ('R', 'AW1', 'N', 'D'): ["'round", 'round'],
 ('EH1', 'S'): ["'s", 'es', 'ess', 'esse', 's', 's.'],
 ('T', 'IH1', 'L'): ["'til", 'til', 'till'],
 ('EY1',): ['a', 'a.', 'ae', 'ay'],
 ('AA1', 'K', 'ER0'): ['aaker', 'ocker'],
 ('AA1', 'N', 'K', 'AO2', 'R'): ['aancor', 'encore', 'oncor'],
 ('AA1', 'R', 'G'): ['aargh', 'argh'],
 ('EH1', 'R', 'AH0', 'N'): ['aaron', 'aran', 'aron', 'ehren'],
 ('EH1', 'R', 'AH0', 'N', 'S', 'AH0', 'N'): ['aaronson', 'aronson'],
 ('AA1', 'S'): ['aase', 'os'],
 ('AA1', 'S', 'AH0', 'N'): ['aasen', 'osten'],
 ('AH0', 'B', 'AE1', 'D', 'IY0'): ['abadi', 'abadie'],
 ('AE1', 'B',

In [9]:
len([i for i in spelling_to_pronunciation if not re.search(r"'?s'?$",i)])

99556

In [10]:
pronunciation = tuple(pronunciation_to_spelling.keys())
have_overlaps = set()
for i in enumerate(pronunciation):
    for j in enumerate(pronunciation[i+1:]):
        if 

<enumerate at 0x7f5b7acb64c8>

In [None]:
pronunciation = tuple(pronunciation_to_spelling.keys())
df = pd.DataFrame(index=range(len(pronunciation)), columns=range(len(pronunciation)))

In [144]:
[not_plural_or_posessive_homophones[i] for i in _not_plural_or_posessive_homophones if i not in not_plural_or_posessive_homophones ]

[["'cuse", 'cues', 'kuse', 'kuze', "q's", "q.'s", 'q.s', 'ques', 'queues'],
 ["'s", 'es', 'ess', 'esse', 's', 's.'],
 ['aase', 'os'],
 ['aches', 'aix'],
 ['acts', 'ax', 'axe', 'x.'],
 ["ad's", 'adds', 'ads', "ads'", 'adz'],
 ['ahs', 'oz'],
 ['ais', 'ayes', 'eis', "eye's", 'eyes', "eyes'", "i's", "i.'s", 'i.s', 'ise'],
 ['aix', 'eckes', 'ex', 'x', 'x.'],
 ['alex', 'alexs'],
 ['alice', 'allis', 'alyce'],
 ['allais', 'allay'],
 ['americorp', 'americorps'],
 ['anas', 'anise'],
 ["angelo's", 'angelos', 'angeloz'],
 ["apec's", 'apex'],
 ['arcs', 'arx'],
 ["art's", 'arts', 'artz'],
 ["b's", "b.'s", 'b.s', "be's", 'beas', "bee's", 'bees', 'beese', 'bes'],
 ['baars', "bar's", "barr's", 'barres', 'barrs', 'bars', 'barz'],
 ['baatz', 'botts', 'botz'],
 ["back's", 'backes', 'backs', 'bax'],
 ['baetz', 'beats', 'beets', 'beitz', 'bietz'],
 ['baise', 'baize', "bay's", 'bayes', 'bays', 'bayse', 'baze'],
 ["ball's", 'balls', 'balz', 'bawls'],
 ["ban's", 'bans', 'benz'],
 ["bart's", 'barts', 'bartz'],


In [12]:
# attempt 1: get all puns separated by one edit
# restrict to swap-only edits
all_pronunciations = frozenset(i for i in pronunciation_to_spelling)
puns = {} # {pronounced_word: [puns of distance 1, puns of distance 2, ...]}

for word in all_pronunciations:
    for index in range(len(word)):
        for target_symbol in SYMBOLS:
            if target_symbol != word[index]:
                pun_attempt = tuple([*word[:index], target_symbol, *word[index + 1:]])
                if pun_attempt in all_pronunciations:
                    if word in puns:
                        puns[word][0].append(pun_attempt)
                    else:
                        puns[word] = [[pun_attempt]]

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.58 µs


In [9]:
puns

{('Y', 'UW1', 'AO0', 'NG'): [[('Y', 'UW1', 'IH0', 'NG')]],
 ('T',
  'AY1',
  'ER0',
  'Z'): [[('B', 'AY1', 'ER0', 'Z'),
   ('F', 'AY1', 'ER0', 'Z'),
   ('HH', 'AY1', 'ER0', 'Z'),
   ('L', 'AY1', 'ER0', 'Z'),
   ('M', 'AY1', 'ER0', 'Z'),
   ('P', 'AY1', 'ER0', 'Z'),
   ('S', 'AY1', 'ER0', 'Z'),
   ('V', 'AY1', 'ER0', 'Z'),
   ('W', 'AY1', 'ER0', 'Z'),
   ('T', 'AW1', 'ER0', 'Z'),
   ('T', 'AY1', 'D', 'Z'),
   ('T', 'AY1', 'L', 'Z'),
   ('T', 'AY1', 'M', 'Z'),
   ('T', 'AY1', 'N', 'Z'),
   ('T', 'AY1', 'ER0', 'D')]],
 ('F',
  'AE1',
  'N',
  'CH',
  'ER0'): [[('HH', 'AE1', 'N', 'CH', 'ER0'),
   ('R', 'AE1', 'N', 'CH', 'ER0'),
   ('F', 'IH1', 'N', 'CH', 'ER0')]],
 ('R',
  'AA1',
  'S',
  'T'): [[('B', 'AA1', 'S', 'T'),
   ('D', 'AA1', 'S', 'T'),
   ('F', 'AA1', 'S', 'T'),
   ('JH', 'AA1', 'S', 'T'),
   ('K', 'AA1', 'S', 'T'),
   ('N', 'AA1', 'S', 'T'),
   ('R', 'AE1', 'S', 'T'),
   ('R', 'AH1', 'S', 'T'),
   ('R', 'EH1', 'S', 'T'),
   ('R', 'EY1', 'S', 'T'),
   ('R', 'IH1', 'S', 'T'),
   

In [11]:
# exhaustive swap-only 3-edit puns
# for word in all_pronunciations:
#     for i in range(len(word)):
#         for j in range(i + 1,len(word)):
#             if i != j:
#                 for symbol_i in SYMBOLS:
#                     for symbol_j in SYMBOLS:
#                         if symbol_i != word[i] and symbol_j != word[j]:
#                             pun_attempt = tuple([
#                                 *word[:i], symbol_i, *word[i + 1:j]])
#                             if pun_attempt in all_pronunciations:
#                                 if word in puns:
#                                     puns[word][0].append(pun_attempt)
#                                 else:
#                                     puns[word] = [[pun_attempt]]

11.443697499232712