# Final Project: KidzBopifying Lyrics
Cleo De Rocco (cleo.m.de.rocco.24@dartmouth.edu)<br>
Abigail Kayser (abigail.e.kayser.24@dartmouth.edu)<br>
Stefel Smith (stefel.s.smith.24@dartmouth.edu)<br>

Dartmouth College, LING48, Spring 2023


## Imports

In [2]:
# Upgrade from version in the VM
!pip install -U nltk==3.4
import nltk
nltk.download('punkt')

!pip install pronouncing
from nltk.corpus import cmudict
import pronouncing



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abbykayser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




In [3]:
import os
import requests
import io 
import random
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline
from nltk.lm import MLE, NgramCounter, Vocabulary
from nltk.util import ngrams
from collections import Counter
from nltk import word_tokenize, sent_tokenize, bigrams, trigrams
from nltk.stem import SnowballStemmer
import fasttext
import fasttext.util
import numpy as np
#import gdown
#import gdown

## Load fasstext English model

In [4]:
!curl -o en.bin.gz https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gzip -d en.bin.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4294M  100 4294M    0     0  22.0M      0  0:03:14  0:03:14 --:--:-- 25.6M
en.bin already exists -- do you wish to overwrite (y or n)? ^C


## Bigram Model

In [5]:
# Open file
file = io.open('combinedLyrics.txt', encoding='utf8')
text = file.read()

In [6]:
# BIGRAM - processing, training, and printing sequence
# Preprocess the tokenized text for language modelling
n = 2
paddedLine = [list(pad_both_ends(word_tokenize(text.lower()), n))]
train, vocab = padded_everygram_pipeline(n, paddedLine)

# Train a n-gram maximum likelihood estimation model.
bigram_model = MLE(n) 
bigram_model.fit(train, vocab)

# Tokenize the text into words
words = nltk.word_tokenize(text)
# Build frequency distribution of words that come after each word in the text

cfd = nltk.ConditionalFreqDist(
    (prev_word, next_word)
    for prev_word, next_word in nltk.bigrams(words)
)

# Define a function to get the top 10 most likely words that come after an input word
def get_top_words(input_word):
    # Get the frequency distribution for the input word
    freq_dist = cfd[input_word.lower()]

    # Get the top 10 most likely words that come after the input word
    top_10_words = freq_dist.most_common(200)

    # Return the top 10 words
    return [word[0] for word in top_10_words]

# Call the function with an input word and print the top 10 most likely words that come after it.
input_word = 'more'
top_10_words = get_top_words(input_word)
print(f'Top 10 words that come after "{input_word}":')
print(top_10_words)


Top 10 words that come after "more":
['than', 'time', ',', 'I', 'dance', 'hours', 'But', 'shot', '...', "'Cause", '(', ')', 'things', 'they', 'now', 'And', 'gas', 'Ooh', 'So', 'It', 'exciting', 'song', 'Got', 'Sorry', 'love', 'Are', 'blue', 'you', 'Glasses', 'You', 'and', 'night', 'or', 'more', 'Then', 'Wasted', 'This', 'days', 'counting', 'Watch', 'what', 'Have', 'in', 'of', 'Oh', 'cynical', 'Atari', 'hits', 'style', '.', 'People', 'seats', 'frustrated', 'famous', 'Than', 'Said', 'Keep', 'try', 'She', 'she', 'Run', 'air', 'every', 'let', 'can', '?', 'Someone', 'like', 'smart', 'messed', 'Well', 'to']


## Bad Word Bag

In [7]:
# Open file
with io.open('bad_words.txt', encoding='utf8') as file:
    text = file.read()

In [9]:
# Tokenize the bad words
bad_words = word_tokenize(text)

# Create a bag of words from the bad words
bag_of_words = nltk.FreqDist(bad_words)

stemmer = SnowballStemmer("english")

# Function to flag bad words in a sentence
def flag_bad_words(sentence):
    flagged_words = []
    tokens = word_tokenize(sentence)

    for word in tokens:
        new_word = stemmer.stem(word)
        if word in bag_of_words:
            flagged_words.append(word)
        elif new_word in bag_of_words:
            flagged_words.append(word)
            
    return flagged_words


## Phonetic Functions

In [10]:
#### PHONETIC ####

def calculate_phonetic_similarity(target_word, candidate_words):
    similarity_scores = {}
    
    # Load the CMU Pronouncing Dictionary
    pronouncing_dict = cmudict.dict()
    
    # Check if the target word exists in the CMU Pronouncing Dictionary
    if target_word.lower() not in pronouncing_dict:
        print("Target word not found in CMU Pronouncing Dictionary.")
        return similarity_scores
    
    target_phonemes = pronouncing_dict[target_word.lower()][0]
    
    for candidate_word in candidate_words:
        # Check if the candidate word exists in the CMU Pronouncing Dictionary
        if candidate_word.lower() in pronouncing_dict:
            candidate_phonemes = pronouncing_dict[candidate_word.lower()][0]
            # Calculate the phonetic similarity using the intersection of phonemes
            similarity_score = len(set(target_phonemes) & set(candidate_phonemes))
            if candidate_word in pronouncing.rhymes(target_word):
                similarity_scores[candidate_word] = (similarity_score + 1, "Rhymes!")
            else:
                similarity_scores[candidate_word] = (similarity_score, "")
        else:
            similarity_scores[candidate_word] = (0, "") # Assign a similarity score of 0 if candidate word not found
    
    return similarity_scores

def strict_rhymes(target_word, candidate_words, similarity_scores): 
    rhymes_from_ngram = []
    rhymes = []
    print_rhymes = []

    for word in candidate_words:
        if word in pronouncing.rhymes(target_word):
            rhymes_from_ngram.append(word)

    if len(rhymes_from_ngram) != 0:
        print("Yay, there is one or more matching rhymes from the dataset! Try using:")
        for rhyme in rhymes_from_ngram:
            print(rhyme, similarity_scores[rhyme])
    else:
        rhymes = pronouncing.rhymes(target_word)
        print("No rhymes were found from the dataset. Try any of these instead!")
        for rhyme in rhymes:
            if rhyme not in bad_words:
                print_rhymes.append(rhyme)
        print(print_rhymes)

    return print_rhymes

def calculate_suggestion(target_word, candidate_words):
    similarity_scores = calculate_phonetic_similarity(target_word, candidate_words)
    rhymes = strict_rhymes(target_word, candidate_words, similarity_scores)
    return rhymes

def phonetics(target_word, candidate_words):
    rhymes = calculate_suggestion(target_word, candidate_words)
    return rhymes
    

## Semantic Functions

In [11]:
#### SEMANTICS ####

embeddings = fasttext.load_model('en.bin')   #load embedding into memeory 

# takes in the bad word and a list of possible replacemnt words using the N gram model 
def similarity(bad_words, edits):
    edit_scores = {}
    for word in edits:
        if word not in text: 
            w1 = embeddings.get_word_vector(bad_words)
            w2 = embeddings.get_word_vector(word)
            dist = np.linalg.norm(w2 - w1)
            edit_scores[word] = dist 
    
    sorted_edit_scores =  sorted(edit_scores.items(), key=lambda x:x[1])
    return sorted_edit_scores 


def semantics(bad_word, suggested_edits):
    print(f"Top words Semantically Simliar to  {bad_word}.\n")
    print(similarity(bad_word,suggested_edits))



## Transformer Model

In [None]:
#### Transformer ####

## Meat of the Program, user input

In [16]:
def replace(sentence, words):
    for bad_word in words:
        word_before = sentence[sentence.index(bad_word) - 1]
        # print(type(sentence))
        # print(sentence.index(bad_word))
        # print(sentence[2], word_before)
        candidate_words = get_top_words(word_before)

        replacing = True
        while(replacing):
            replace_choice = input("Which type of replacement would you like for '" + str(bad_word) + "' ? Enter [p]honetic, [s]emantic, [b]oth phonetic and semantic, [t]ransformer, [a]ll, or [n]one:     ")
            if replace_choice == "p":
                replacing = False
                print("You chose phonetic replacement for " + str(bad_word) + ".")
                rhymes = phonetics(bad_word, candidate_words)
            elif replace_choice == "s":
                replacing = False
                print("You chose semantic replacement for " + str(bad_word) + ".")
                semantics(bad_word, candidate_words)
            elif replace_choice == "b":
                replacing = False
                print("You chose phonetic and semantic replacement for " + str(bad_word) + ".")
                both_candidates = phonetics(bad_word, [])
                semantics(bad_word, both_candidates)
            elif replace_choice == "t":
                replacing = False
                print("You chose transformer replacement for " + str(bad_word) + ".")
            elif replace_choice == "a":
                replacing = False
                print("You chose all replacements for " + str(bad_word) + ".")
            elif replace_choice == "n":
                replacing = False
                print("You chose not to replace " + str(bad_word) + ".")
            else:
                print("Please enter [p]honetic, [s]emantic, [t]ransformer, or [a]ll")   


In [17]:
responding = True
while(responding):
    userString = input("USER:     ")
    if userString == "exit":
        print("COMPUTER: Adios!")
        responding = False
        break
    print("You entered: ' " + str(userString) + "'")
    userString = userString.lower()
    output = flag_bad_words(userString)
    if not output:
        print("No bad words found.")
        continue


    replacing = True
    while(replacing):
        print("Flagged words: " + str(output))
        replace_want = input("Start replacement process for the word(s)? Enter [y] [n]:    ")
        if replace_want == "y":
            userString = userString.replace(",", "")
            replace(userString.split(), output)
            replacing = False
        elif replace_want == "n":
            print("Continue entering sentences to flag.")
            replacing = False
        else:
            print("Please enter [y] or [n]")


You entered: ' this is so fucked up'
Flagged words: ['fucked']
You chose phonetic and semantic replacement for fucked.
No rhymes were found from the dataset. Try any of these instead!
['abduct', 'bucked', 'chucked', 'conduct', 'construct', 'deconstruct', 'deduct', 'destruct', 'ducked', 'duct', 'induct', 'instruct', 'lucht', 'lucked', 'obstruct', 'plucked', 'reconstruct', 'self-destruct', 'shucked', 'sucked', 'trucked', 'tucked']
Top words Semantically Simliar to  fucked.

[('chucked', 1.0123838), ('self-destruct', 1.0574992), ('deconstruct', 1.0687076), ('reconstruct', 1.0861137), ('lucked', 1.1343501), ('construct', 1.1421357), ('plucked', 1.1502337), ('instruct', 1.1918026), ('shucked', 1.1945632), ('bucked', 1.2127217), ('destruct', 1.220667), ('obstruct', 1.2350947), ('ducked', 1.2380636), ('trucked', 1.2427006), ('conduct', 1.2672029), ('tucked', 1.2822165), ('deduct', 1.4913756), ('induct', 1.5326288), ('abduct', 1.5912011), ('lucht', 1.6631596), ('duct', 2.1647348)]
You entered: