## word2vec Exploration

In [9]:
import numpy as np
import pandas as pd
import gensim
import sklearn.manifold
import re
import time
import pprint as pp
from collections import Counter
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

## Load Data and Models

In [10]:
# Load the two saved Word2Vec models (skip-gram first, then CBOW) and the
# lyrics table that the rest of the notebook works with.
sg_model, cbow_model = map(
    gensim.models.Word2Vec.load,
    ('models/sg_model.w2v', 'models/cbow_model.w2v'),
)
lyrics_data = pd.read_csv("LyricsFreak.csv")

## Helper Methods

In [18]:
def split_lyric(lyrics):
    """Turn a lyric string into a list of lower-cased tokens.

    The text is first passed through ``clean_lyric`` and then cut on single
    spaces. Runs of spaces produce empty pieces, which are discarded, so the
    result contains only non-empty words.
    """
    cleaned = clean_lyric(lyrics)
    return [token.lower() for token in cleaned.split(' ') if token]

def clean_lyric(lyric):
    """Return ``lyric`` with punctuation marks and newlines turned into spaces.

    Every character listed in the punctuation string is replaced by a single
    space (one for one, so the length of the text does not change). The
    apostrophe is not in that list, so contractions such as "don't" are kept
    intact.
    """
    punctuation = '!\"#$%&\\()*+,-./:;<=>/?@[]^_`{|}~«»\n'
    to_space = str.maketrans(punctuation, ' ' * len(punctuation))
    return lyric.translate(to_space)

def cbow_songVector(row):
    """Return the normalised sum of CBOW word vectors for one lyric string.

    ``row`` is tokenised with ``split_lyric`` and the tokens are looked up in
    the module-level ``cbow_model``. See ``sum_vectors`` for the shape of the
    result and for how empty input is handled.
    """
    return sum_vectors(split_lyric(row), cbow_model)


def sg_songVector(row):
    """Return the normalised sum of skip-gram word vectors for one lyric string.

    ``row`` is tokenised with ``split_lyric`` and the tokens are looked up in
    the module-level ``sg_model``. See ``sum_vectors`` for the shape of the
    result and for how empty input is handled.
    """
    return sum_vectors(split_lyric(row), sg_model)


def sum_vectors(lista, model):
    """Add up the vectors of the words in ``lista`` and L2-normalise the sum.

    Parameters
    ----------
    lista : iterable of str
        Words to look up.
    model : object
        Either something that exposes its word vectors through a ``wv``
        attribute (a gensim ``Word2Vec`` model) or something that can be
        indexed by word directly (``KeyedVectors``, a plain dict, ...).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, dim)``. If the summed vector is all zeros it is
        returned unchanged instead of being divided by zero. If ``lista`` is
        empty, a zero array of shape ``(1, vector_size)`` is returned.

    Raises
    ------
    ValueError
        If ``lista`` is empty and neither the vectors object nor ``model``
        has a ``vector_size`` attribute to size the zero array with.

    Lookup errors for words the model does not know are not caught here.
    """
    # Indexing a Word2Vec model directly is deprecated (and gone in gensim 4);
    # go through ``wv`` when it exists, otherwise index the object itself.
    vectors = getattr(model, 'wv', model)
    word_vectors = [vectors[word] for word in lista]

    if not word_vectors:
        # Nothing to sum: the original code crashed here because the running
        # total was still the int 0, which has no ``reshape``.
        size = getattr(vectors, 'vector_size', getattr(model, 'vector_size', None))
        if size is None:
            raise ValueError(
                "cannot build a vector from an empty word list: "
                "the model does not expose 'vector_size'"
            )
        return np.zeros((1, size))

    vector_sum = np.sum(word_vectors, axis=0).reshape(1, -1)
    # Plain numpy L2 normalisation of the single row; this avoids relying on
    # ``sklearn.preprocessing``, which this notebook never imports explicitly.
    norm = np.linalg.norm(vector_sum)
    if norm == 0:
        norm = 1.0  # leave an all-zero sum as it is
    return vector_sum / norm

def display_most_common(word, models, num):
    """Print the ``num`` most similar words to ``word`` for each model.

    Parameters
    ----------
    word : str
        Query word; printed first, preceded by a blank line.
    models : iterable
        Entries whose item 0 is a label to print and whose item 1 is the
        model to query.
    num : int
        How many neighbours to request (passed as ``topn``).

    For each entry the label is printed, followed by a pretty-printed list of
    ``'<neighbour> => <similarity>'`` strings.
    """
    print('\n' + word)
    for entry in models:
        label, model = entry[0], entry[1]
        print(label)
        # ``Word2Vec.most_similar`` is deprecated and was removed in gensim 4;
        # the method lives on the ``wv`` attribute. Fall back to the object
        # itself when it has no ``wv`` (e.g. it already is a KeyedVectors).
        vectors = getattr(model, 'wv', model)
        predictions = vectors.most_similar(positive=[word], topn=num)
        pp.pprint([neighbour + ' => ' + str(score) for neighbour, score in predictions])


In [4]:
# Timing harness for attaching a CBOW and a skip-gram song vector to every row
# of lyrics_data['text'].
# NOTE: both `apply` lines below are commented out, so nothing is computed and
# each printed duration is only the gap between two consecutive time.time()
# calls (hence the microsecond readings in the cell output).
start_time = time.time()
#lyrics_data['cbow_song_vector'] = lyrics_data['text'].apply(cbow_songVector)
print("--- %s seconds ---" % (time.time() - start_time))
start_time = time.time()
#lyrics_data['sg_song_vector'] = lyrics_data['text'].apply(sg_songVector)
print("--- %s seconds ---" % (time.time() - start_time))

--- 7.796287536621094e-05 seconds ---
--- 7.295608520507812e-05 seconds ---


## Most Common Words

In [12]:
# Count how often each whitespace-separated token of lyrics_text.txt occurs,
# ignoring NLTK's English stop words.
stops = stopwords.words('english')
# `stops` is a list; test membership against a set so each token is checked
# with a hash lookup instead of a scan over the whole list.
stop_set = set(stops)
# The file is only read, so open it read-only ('r+' would also demand write
# permission) and let the `with` block close the handle when counting is done.
with open("lyrics_text.txt", 'r', encoding='utf-8-sig') as file:
    wordcount = Counter(word for word in file.read().split() if word not in stop_set)

In [15]:
# Show the ten most frequent remaining tokens together with their counts.
wordcount.most_common(n=10)

[("i'm", 105835),
 ('love', 92539),
 ("don't", 81028),
 ('know', 72551),
 ("it's", 68750),
 ('like', 63617),
 ('oh', 63040),
 ('got', 50299),
 ('one', 44945),
 ('go', 44280)]

## Examples

In [19]:
# Query words for a side-by-side look at what each model considers similar.
test_words = [
    'yeah', 'feel', 'girl', 'heart', 'take', 'life', 'back', 'never', 'die',
    'away', 'give', 'time', 'night', 'day', 'man', 'dream', 'world', 'little',
]
test_words2 = [
    'baby', 'good', 'keep', 'around', 'again', 'eye', 'mind', 'fall', 'world',
    'hell', 'we', 'like', 'breathe', 'ya', 'you', 'thang', 'get', 'fire',
    'rock', "don't", 'woman', 'music', 'dancing', 'lonely',
]

# (label, model) pairs; the label is printed above each model's neighbours.
models = [('skipgram', sg_model), ('cbow', cbow_model)]

# Print the five nearest neighbours for every word of the first list, then
# for every word of the second list.
for word in test_words + test_words2:
    display_most_common(word, models, 5)
    
    

        


yeah
skipgram
['ooh => 0.804219126701355',
 'whew => 0.790367841720581',
 'oh => 0.7862958312034607',
 'whoa => 0.7658944129943848',
 'whoaaa => 0.7559748291969299']
cbow
['oh => 0.811190664768219',
 'ooh => 0.7243115305900574',
 'whoa => 0.7111584544181824',
 'shoobedoop => 0.6727779507637024',
 'baby => 0.6712197661399841']

feel
skipgram
['feels => 0.7562969923019409',
 'uncomfortable => 0.7107282876968384',
 "rhythm's => 0.709407389163971",
 'felt => 0.7055429220199585',
 'tantalize => 0.7048778533935547']
cbow
['feels => 0.6476834416389465',
 'feeling => 0.5865485668182373',
 'know => 0.5475054383277893',
 "feelin' => 0.5451181530952454",
 'touch => 0.5211158990859985']

girl
skipgram
['tko => 0.759039580821991',
 'boy => 0.755138099193573',
 'pageant => 0.7467314004898071',
 'baby => 0.7417809963226318',
 'flirtin => 0.7287603616714478']
cbow
['boy => 0.6998615860939026',
 'woman => 0.6710625290870667',
 'baby => 0.6699797511100769',
 'lady => 0.6120254993438721',
 'lover => 0.5