# Final Project: Phonetic Similarity
Abigail Kayser (abigail.e.kayser.24@dartmouth.edu)<br>
Dartmouth College, LING48, Spring 2023


In [12]:
import nltk
from nltk.corpus import cmudict

# Load the CMU Pronouncing Dictionary.
# The corpus has to be available locally before it can be read, so the download
# call comes first; when a copy is already present NLTK only reports that the
# package is up-to-date.
nltk.download('cmudict')
# Maps a lowercase word to its list of pronunciations. The code below looks
# words up in lowercase and treats each pronunciation as a sequence of phoneme
# symbols, using only the first pronunciation of a word for scoring.
pronouncing_dict = cmudict.dict()

# Function to calculate phonetic similarity between two words
def calculate_phonetic_similarity(target_word, candidate_words):
    """Score how phonetically similar each candidate word is to a target word.

    Every word is looked up (case-insensitively) in the module-level
    ``pronouncing_dict`` and represented by its *first* listed pronunciation.

    Args:
        target_word: The word to compare against.
        candidate_words: Iterable of words to score.

    Returns:
        A list of ``(candidate_word, score, note)`` tuples in the same order
        as ``candidate_words``:

        * Candidate not in the dictionary: score ``0``, empty note.
        * Candidate pronounced exactly like the target: score is the number
          of phonemes in the target plus one, note ``"Perfect Match!"``.
        * Otherwise: score is the number of distinct phoneme symbols the two
          pronunciations share; the note is ``"Rhymes!"`` when both end in
          the same phoneme, else empty.

        If the target word itself is not in the dictionary a message is
        printed and an empty list is returned.

    Note:
        The "rhyme" test is a deliberately simple heuristic: it compares only
        the final phoneme, so words that merely end in the same consonant are
        also labelled as rhymes. A score of 0 is ambiguous between "unknown
        word" and "known word with no phonemes in common".
    """
    target_pronunciations = pronouncing_dict.get(target_word.lower())
    if target_pronunciations is None:
        print("Target word not found in CMU Pronouncing Dictionary.")
        return []

    # Loop-invariant data about the target, computed once.
    target_phonemes = target_pronunciations[0]
    target_phoneme_set = set(target_phonemes)
    perfect_match_score = len(target_phonemes) + 1  # Always beats any overlap count

    similarity_scores = []
    for candidate_word in candidate_words:
        candidate_pronunciations = pronouncing_dict.get(candidate_word.lower())
        if candidate_pronunciations is None:
            # Unknown words (punctuation, slang, ...) are kept with a zero score
            similarity_scores.append((candidate_word, 0, ""))
            continue

        candidate_phonemes = candidate_pronunciations[0]

        # Compare the same (first) pronunciation that scoring and rhyming use.
        # Comparing the full variant lists would miss homophones whose
        # dictionary entries list a different number of pronunciations.
        if candidate_phonemes == target_phonemes:
            similarity_scores.append(
                (candidate_word, perfect_match_score, "Perfect Match!")
            )
            continue

        # Overlap of distinct phoneme symbols; order and repeats are ignored
        similarity_score = len(target_phoneme_set & set(candidate_phonemes))

        # Slicing (rather than indexing) keeps this safe for an empty pronunciation
        if target_phonemes[-1:] == candidate_phonemes[-1:]:
            note = "Rhymes!"
        else:
            note = ""
        similarity_scores.append((candidate_word, similarity_score, note))

    return similarity_scores

# Demo: rank a handful of short words against "cat"
target_word = "cat"
candidate_words = ["bat", "rat", "hat", "mat", "had", "dog", "cat"]

# Score every candidate and order the results from most to least similar
similarity_scores = sorted(
    calculate_phonetic_similarity(target_word, candidate_words),
    key=lambda entry: entry[1],
    reverse=True,
)

# Report each remaining candidate with its score; zero-score entries are skipped
for candidate, score, note in similarity_scores:
    if score == 0:
        continue
    print(candidate, score, note)


[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/abbykayser/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


cat 4 Perfect Match!
bat 2 Rhymes!
rat 2 Rhymes!
hat 2 Rhymes!
mat 2 Rhymes!
had 1 


In [13]:
# Demo: rank a bag of song-lyric tokens (punctuation included) against one word
target_word = "shit"
candidate_words = [
    'than', 'time', ',', 'I', 'dance', 'hours', 'But', 'shot', '...', "'Cause",
    '(', ')', 'things', 'they', 'now', 'And', 'gas', 'Ooh', 'So', 'It',
    'exciting', 'song', 'Got', 'Sorry', 'love', 'Are', 'blue', 'you',
    'Glasses', 'You', 'and', 'night', 'or', 'more', 'Then', 'Wasted', 'This',
    'days', 'counting', 'Watch', 'what', 'Have', 'in', 'of', 'Oh', 'cynical',
    'Atari', 'hits', 'style', '.',
]

# Score every token and order the results from most to least similar
similarity_scores = sorted(
    calculate_phonetic_similarity(target_word, candidate_words),
    key=lambda entry: entry[1],
    reverse=True,
)

# Report each remaining token with its score; zero-score entries are skipped
for candidate, score, note in similarity_scores:
    if score == 0:
        continue
    print(candidate, score, note)

shot 2 Rhymes!
It 2 Rhymes!
hits 2 
time 1 
But 1 Rhymes!
things 1 
exciting 1 
Got 1 Rhymes!
night 1 Rhymes!
Wasted 1 
This 1 
counting 1 
what 1 Rhymes!
cynical 1 
Atari 1 
style 1 
