# Lyric LDA and NMF

## Import libraries and json file 

In [1]:
import json
import matplotlib.pyplot as plt
import re
plt.style.use('ggplot')


json_file = open('../data_collection/data/song_lyrics.json')
song_lyrics_list = json.load(json_file)


## Create dataframe of song data

In [2]:
import pandas as pd

# construct dataframe with 1 row per song of:
# song  .... its name
# verse_lyrics .... self explanatory
# bridge_lyrics, chorus_lyrics, outro_lyrics
# song order .... Verse 1, Verse 2, Chorus, Verse 3 etc
processed_song_list = []
song_obj = song_lyrics_list[0]
for song_obj in song_lyrics_list:
    if song_obj['lyrics'] == '': continue
    obj = {}
    obj['song'] = song_obj['song']

    text = song_obj['lyrics']
    # remove initial \n
    while text[0] == '\n':
        text = text[1:]
    verse_markers = re.findall(r'\[(\w+[ \w]*)\]', text)
    obj['song_order'] = verse_markers
    # ', '.join(verse_markers) if you wanna make this a string

    # split lyrics by the markers we just found .. regex also includes []
    obj['verse_lyrics'] = ''
    obj['bridge_lyrics'] = ''
    obj['chorus_lyrics'] = ''
    obj['outro_lyrics'] = ''
    obj['all_lyrics'] = ''

    # https://stackoverflow.com/questions/10974932/split-string-based-on-a-regular-expression
    lyrics_arr = re.split(r'\[\w+[ \w]*\]', text)[1:] # start at 1 as index 0 is ''
    for verse_name, lyric_segment in zip(verse_markers, lyrics_arr):
        # clean up starts of lyrics
        while lyric_segment[0].isalpha() != True:
            lyric_segment = lyric_segment[1:]
        
        # treat verses
        if(verse_name[:5] ==  'Verse'):
            obj['verse_lyrics'] = obj['verse_lyrics'] + ' ' + lyric_segment
        elif(verse_name == 'Bridge'):
            obj['bridge_lyrics'] = obj['bridge_lyrics'] + ' ' + lyric_segment
        elif(verse_name == 'Chorus'):
            obj['chorus_lyrics'] = obj['chorus_lyrics'] + ' ' + lyric_segment
        elif(verse_name == 'Outro'):
            obj['outro_lyrics'] = obj['outro_lyrics'] + ' ' + lyric_segment
        
        # dump all lyrics with ' | ' between the verses
        obj['all_lyrics'] = obj['all_lyrics'] + ' | ' + lyric_segment

    # append to list
    processed_song_list.append(obj)


processed_song_df = pd.DataFrame(processed_song_list)
processed_song_df.head()


Unnamed: 0,song,song_order,verse_lyrics,bridge_lyrics,chorus_lyrics,outro_lyrics,all_lyrics
0,Fearless (Taylor’s Version),"[Verse 1, Verse 2, Chorus, Verse 3, Chorus, Br...",There's something 'bout the way\nThe street l...,"Well, you stood there with me in the doorway\...",And I don't know how it gets better than this...,"Oh, oh\nOh-oh, yeah\n\n",| There's something 'bout the way\nThe street...
1,Fifteen (Taylor’s Version),"[Verse 1, Chorus, Verse 2, Chorus, Bridge, Cho...",You take a deep breath and you walk through t...,When all you wanted was to be wanted\nWish yo...,Cause when you're fifteen and somebody tells ...,"Your very first day\nTake a deep breath, girl...",| You take a deep breath and you walk through...
2,Love Story (Taylor’s Version),"[Verse 1, Chorus, Verse 2, Chorus, Bridge, Cho...",We were both young when I first saw you\nI cl...,And I got tired of waiting\nWondering if you ...,"Romeo, take me somewhere we can be alone\nI'l...","Oh, oh, oh\nOh, oh, oh\n'Cause we were both y...",| We were both young when I first saw you\nI ...
3,Hey Stephen (Taylor’s Version),"[Intro, Verse 1, Chorus, Verse 2, Chorus, Brid...","Hey Stephen, I know looks can be deceiving\nB...","They're dimming the street lights, you're per...",Cause I can't help it if you look like an ang...,"Ah-uh, myself\nMmm-mm, I can't help myself\nI...","| Mmm-mm, mm-mm\nMmm-mm, mm-mm\nMmm-mm, mm-mm..."
4,White Horse (Taylor’s Version),"[Verse 1, Chorus, Verse 2, Chorus, Bridge, Cho...","Say you're sorry, that face of an angel\nCome...",And there you are on your knees\nBeggin' for ...,"That I'm not a princess, this ain't a fairyta...","Oh, whoa, whoa, whoa\nTry and catch me now, o...","| Say you're sorry, that face of an angel\nCo..."


## Fix up the lyrics

In [3]:
from nltk.corpus import stopwords

orig_lyrics_list = processed_song_df['all_lyrics'].to_list()

# fix | and \n and ,  
orig_lyrics_list = [i.replace('|',' ') for i in orig_lyrics_list]
orig_lyrics_list = [i.replace('\n',' ') for i in orig_lyrics_list]
orig_lyrics_list = [i.replace(',',' ') for i in orig_lyrics_list]

# try to remove stopwords (clearer what is going on)
lyrics_list = [i.split(' ') for i in orig_lyrics_list]
english_stopwords = stopwords.words('english')
for c, lyrics in enumerate(lyrics_list):
    lyrics_list[c] = [i for i in lyrics if (i!='' and i not in english_stopwords)]
    orig_lyrics_list[c] = ' '.join(lyrics_list[c]).lower()

# fix - and '
orig_lyrics_list = [i.replace('-','') for i in orig_lyrics_list]
orig_lyrics_list = [i.replace('-','') for i in orig_lyrics_list]

orig_lyrics_list[0]

"there's something 'bout way the street looks rained there's glow pavement walk car and know i wanna ask dance right in middle parking lot yeah oh yeah we're driving road i wonder know i'm trying hard get caught but cool run hands hair absentmindedly making want and i know gets better you take hand drag head first fearless and i know i'd dance in storm best dress fеarless so baby drive slow 'til run road onеhorse town i wanna stay right passenger's seat you put eyes in moment capture remember cause i know gets better you take hand drag head first fearless and i know i'd dance in storm best dress fearless oh oh well stood doorway my hands shake i'm usually way you pull i'm little brave it's first kiss flawless really something it's fearless oh yeah cause i know gets better you take hand drag head first fearless and i know i'd dance in storm best dress fearless 'cause i know gets better you take hand drag head first fearless and i know i'd dance in storm best dress fearless oh oh ohoh ye

## NMF

In [4]:
# NMF and LDA based on following:
# https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

categories = 5
no_top_words = 20

# Display function
def display_topics(model, feature_names, no_top_words):
    for i, topic in enumerate(model.components_):
        print("Topic %d:" % i)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

# NMF
# tf-idf represenation
tfidf_vectorizer = TfidfVectorizer(token_pattern="(?u)[\w']*") # token pattern is because we include ' in our tokens
tfidf = tfidf_vectorizer.fit_transform(orig_lyrics_list)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

nmf = NMF(n_components=categories, random_state=1, alpha=.1, l1_ratio=.4, init='nndsvd')
nmf.fit(tfidf)
# print the topics
display_topics(nmf, tfidf_feature_names, no_top_words)



Topic 0:
 i and know i'm you like never but time oh love knew come one want could my would still
Topic 1:
yourself fuck freckle free freedom freezing fresh freshman friend friends friеnd from front frozen frustrating frustratеd fucked gets full fun
Topic 2:
yourself fuck freckle free freedom freezing fresh freshman friend friends friеnd from front frozen frustrating frustratеd fucked gets full fun
Topic 3:
yourself fuck freckle free freedom freezing fresh freshman friend friends friеnd from front frozen frustrating frustratеd fucked gets full fun
Topic 4:
tallest spinning tiptoes mirrorball highest hush shining still heels i'm try version dear end love i'll tonight you'll near everything


## LDA

In [5]:
# LDA 
# bag of words
tf_vectorizer = CountVectorizer(token_pattern="(?u)[\w']*")
tf = tf_vectorizer.fit_transform(orig_lyrics_list)
tf_feature_names = tf_vectorizer.get_feature_names()
# create lda
# learning decay (0.5, 1.0] ... default .7
# learning_offset - greater than 1
lda = LatentDirichletAllocation(n_components=categories, learning_method='online', 
                                learning_offset=50 , learning_decay=.9, random_state=0, batch_size=10, max_iter=30)
lda.fit(tf)
print("Model Perplexity is " +  str(lda.bound_))

# print the topics
print('------------------------------------------')
display_topics(lda, tf_feature_names, no_top_words)




Model Perplexity is 52.169705564184625
------------------------------------------
Topic 0:
 i and i'm know you like but never time want one oh 'cause see come could i'd the back
Topic 1:
twice walking somewhere screaming clues onеhorse commit doorway curse dad life presents limbs bedsheets flawless yogurt mmm first sharks hit
Topic 2:
stole alright steel bеing stare sweetest intoxicating leaves divide kissing lookout our brag video staircase seven ones smirk on putting
Topic 3:
1 grows scared cliffside coming respects need riding shop balcony wishes adjusting cried choices someone's escape house bigger nevеr rodeo
Topic 4:
flannel known under shape 'round natural champion lifetimes pick photograph around ruin pages heart's happiness fears wait breaks creaks curve


Future investigations:
https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0