# Final Project: Phonetic Similarity
Abigail Kayser (abigail.e.kayser.24@dartmouth.edu)<br>
Dartmouth College, LING48, Spring 2023


In [31]:
import nltk
from nltk.corpus import cmudict
import pronouncing

# Load the CMU Pronouncing Dictionary
nltk.download('cmudict')
pronouncing_dict = cmudict.dict()

# Function to calculate phonetic similarity between two words
def calculate_phonetic_similarity(target_word, candidate_words):
    similarity_scores = []
    
    # Load the CMU Pronouncing Dictionary
    pronouncing_dict = cmudict.dict()
    
    # Check if the target word exists in the CMU Pronouncing Dictionary
    if target_word.lower() not in pronouncing_dict:
        print("Target word not found in CMU Pronouncing Dictionary.")
        return similarity_scores
    
    target_phonemes = pronouncing_dict[target_word.lower()][0]
    
    for candidate_word in candidate_words:
        # Check if the candidate word exists in the CMU Pronouncing Dictionary
        if candidate_word.lower() in pronouncing_dict:
            candidate_phonemes = pronouncing_dict[candidate_word.lower()][0]
            # Calculate the phonetic similarity using the intersection of phonemes
            similarity_score = len(set(target_phonemes) & set(candidate_phonemes))
            if candidate_word in pronouncing.rhymes(target_word):
                similarity_scores.append((candidate_word, similarity_score, "Rhymes!"))
            else:
                similarity_scores.append((candidate_word, similarity_score, ""))
        else:
            similarity_scores.append((candidate_word, 0, ""))  # Assign a similarity score of 0 if candidate word not found
    
    return similarity_scores


# Example usage
target_word = "cat"
candidate_words = ["bat", "rat", "hat", "mat", "had", "dog"]

similarity_scores = calculate_phonetic_similarity(target_word, candidate_words)

# Sort the candidate words by their similarity score in descending order
similarity_scores.sort(key=lambda x: x[1], reverse=True)

# Print the candidate words and their similarity scores
for candidate, score, note in similarity_scores:
    if (score != 0):
        print(candidate, score, note)


[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/abbykayser/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


bat 2 Rhymes!
rat 2 Rhymes!
hat 2 Rhymes!
mat 2 Rhymes!
had 1 


In [32]:
# Example usage
target_word = "shit"
candidate_words = ['than', 'time', ',', 'I', 'dance', 'hours', 'But', 'shot', '...', "'Cause", '(', ')', 'things', 'they', 'now', 'And', 'gas', 'Ooh', 'So', 'It', 'exciting', 'song', 'Got', 'Sorry', 'love', 'Are', 'blue', 'you', 'Glasses', 'You', 'and', 'night', 'or', 'more', 'Then', 'Wasted', 'This', 'days', 'counting', 'Watch', 'what', 'Have', 'in', 'of', 'Oh', 'cynical', 'Atari', 'hits', 'style', '.', 'People', 'seats', 'frustrated', 'famous', 'Than', 'Said', 'Keep', 'try', 'She', 'she', 'Run', 'air', 'every', 'let', 'can', '?', 'Someone', 'like', 'smart', 'messed', 'Well', 'to']

similarity_scores = calculate_phonetic_similarity(target_word, candidate_words)

# Sort the candidate words by their similarity score in descending order
similarity_scores.sort(key=lambda x: x[1], reverse=True)

# Print the candidate words and their similarity scores
for candidate, score, note in similarity_scores:
    if (score != 0):
        print(candidate, score, note)

shot 2 
It 2 
hits 2 
time 1 
But 1 
things 1 
exciting 1 
Got 1 
night 1 
Wasted 1 
This 1 
counting 1 
what 1 
cynical 1 
Atari 1 
style 1 
seats 1 
frustrated 1 
try 1 
She 1 
she 1 
let 1 
smart 1 
messed 1 
to 1 


In [2]:
pip install pronouncing

Collecting pronouncing
  Downloading pronouncing-0.2.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting cmudict>=0.4.0
  Downloading cmudict-1.0.13-py3-none-any.whl (939 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.3/939.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting importlib-resources<6.0.0,>=5.10.1
  Downloading importlib_resources-5.12.0-py3-none-any.whl (36 kB)
Collecting importlib-metadata<6.0.0,>=5.1.0
  Downloading importlib_metadata-5.2.0-py3-none-any.whl (21 kB)
Building wheels for collected packages: pronouncing
  Building wheel for pronouncing (setup.py) ... [?25ldone
[?25h  Created wheel for pronouncing: filename=pronouncing-0.2.0-py2.py3-none-any.whl size=6252 sha256=6bb30736a7e2bd964b293113d188cf6d31a1bafd8f7f0156017bf329c1f48dea
  Stored in directory: /Users/abbykayser/Library/Caches/pip/wheels/ee/d4/c2/fb8c0e2009b75358874506ff2ce1ee79370b6ef5cf08922206
Successfully built pronouncing
Installing 

In [33]:
import pronouncing
target_word = "beer"
candidate_words = ['to', 'a', 'fun', 'the', 'you', 'been', 'it', 'known', 'some', 'mercy', 'this', 'I', 'put', 'high', 'traveled', "n't", 'waited', 'Somebody', 'told', 'no', 'got', 'never', 'faith', 'changed', 'made', 'your', 'an', 'seen', 'any', 'one', 'That', 'all', 'said', 'done', ',', 'become', 'found', 'me', 'is', 'And', 'his', 'The', 'even', 'decided', 'ever', 'stayed', 'another', 'pain', 'sorrow', 'things', 'given', 'How', 'treated', 'history', '...', 'so', 'everything', 'ya', 'had', 'loved', '?', 'groupies', 'started', 'good', 'each', 'crossed', 'time', 'Like', 'For', 'what', 'held', 'let', 'fathomed', 'hands', 'my', 'digital', 'diddly']
def strict_rhymes(target_word, candidate_words): 
    rhymes_from_ngram = []
    rhymes = []

    yes = "wheeze" in pronouncing.rhymes("cheese")
    for word in candidate_words:
        if word in pronouncing.rhymes(target_word):
            rhymes_from_ngram.append(word)

    if len(rhymes_from_ngram) != 0:
        print("Yay, there is one or more matching rhymes from the dataset! Try using:")
        print(rhymes_from_ngram)
    else:
        rhymes = pronouncing.rhymes(target_word)
        print("No rhymes were found from the dataset. Try any of these instead!")
        print(rhymes)
    return

strict_rhymes(target_word, candidate_words)
        

No rhymes were found from the dataset. Try any of these instead!
['adhere', 'alvear', 'amir', 'amir', 'appear', 'auctioneer', 'austere', 'bahir', 'bandolier', 'bashir', 'bassir', 'beare', 'bebear', 'belvedere', 'belvedere', 'bere', 'bioengineer', 'bombardier', 'briere', 'brigadeer', 'brigadier', 'budgeteer', 'career', 'casebeer', 'casebeer', 'cashier', 'cavalier', 'chachere', 'chandelier', 'charpentier', 'cheer', 'chevalier', 'chusmir', 'clear', 'cleere', 'cohere', 'commandeer', 'conventioneer', 'crear', 'creer', 'crochetiere', 'cyr', 'davir', 'dear', 'deer', 'deere', 'desir', 'disappear', 'domineer', 'dornier', 'dubilier', 'dyneer', 'dyneer', 'ear', 'electioneer', 'emir', 'emir', 'engineer', 'exovir', 'fear', 'fier', 'financier', 'financier', 'financiere', 'fleer', 'four-year', 'frear', 'freire', 'frere', 'frontier', 'gear', 'geer', 'gere', 'gier', 'giere', 'gondolier', 'grear', 'greer', 'guinier', 'imagineer', 'insincere', 'interfere', 'interfere', 'jeer', 'kamir', 'kear', 'kier', 'k

In [34]:
def give_phonetic_sim(target_word, candidate_words):
    similarity_scores = calculate_phonetic_similarity(target_word, candidate_words)

    # Sort the candidate words by their similarity score in descending order
    similarity_scores.sort(key=lambda x: x[1], reverse=True)

    # Print the candidate words and their similarity scores
    for candidate, score, note in similarity_scores:
        if (score != 0):
            print(candidate, score, note)
    return

give_phonetic_sim(target_word, candidate_words)


been 2 
it 1 
this 1 
traveled 1 
Somebody 1 
your 1 
become 1 
is 1 
his 1 
sorrow 1 
things 1 
given 1 
treated 1 
history 1 
everything 1 
groupies 1 
started 1 
crossed 1 
For 1 
digital 1 
