## Word2Vec Exploration

In [2]:
import numpy as np
import pandas as pd
import gensim
import sklearn.manifold
import re
import time
from collections import Counter
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [3]:
# Load the two saved Word2Vec models (in this order) and the lyrics table
# that the rest of the notebook works with.
sg_model, cbow_model = (
    gensim.models.Word2Vec.load(model_path)
    for model_path in ('models/sg_model.w2v', 'models/cbow_model.w2v')
)
lyrics_data = pd.read_csv("LyricsFreak.csv")

In [3]:
def split_lyric(lyrics):
    """Turn a raw lyric string into a list of lowercase tokens.

    The text is first passed through ``clean_lyric``, runs of spaces are then
    collapsed to a single space, and the result is split on single spaces.
    Empty pieces are discarded.

    Args:
        lyrics: The lyric text to tokenise.

    Returns:
        A list of non-empty, lowercased tokens in their original order.
    """
    collapsed = re.sub(' +', ' ', clean_lyric(lyrics))
    tokens = []
    for piece in collapsed.split(' '):
        if piece != '':
            tokens.append(piece.lower())
    return tokens

def clean_lyric(lyric):
    """Replace punctuation characters and newlines in ``lyric`` with spaces.

    The apostrophe is not part of the replaced set, so contractions such as
    "don't" keep their form.

    Args:
        lyric: The text to clean.

    Returns:
        A string of the same length with every listed character replaced by
        a single space.
    """
    # The final entry must be a real newline ('\n'). Writing it as '\\n' puts a
    # backslash and the letter "n" into the set, which blanked every "n" in the
    # lyrics ("know" -> "k ow") and left actual line breaks in place.
    punct_str = '!"#$%&\\()*+,-./:;<=>?@[]^_`{|}~«»\n'
    # One translate pass instead of one full-string replace per character.
    return lyric.translate(str.maketrans(dict.fromkeys(punct_str, ' ')))

def cbow_songVector(row):
    """Build a unit-length song vector for one lyric using ``cbow_model``.

    Args:
        row: Raw lyric text; it is tokenised with ``split_lyric``.

    Returns:
        A ``(1, dim)`` array, see ``sum_vectors``.
    """
    return sum_vectors(split_lyric(row), cbow_model)


def sg_songVector(row):
    """Build a unit-length song vector for one lyric using ``sg_model``.

    Args:
        row: Raw lyric text; it is tokenised with ``split_lyric``.

    Returns:
        A ``(1, dim)`` array, see ``sum_vectors``.
    """
    return sum_vectors(split_lyric(row), sg_model)


def sum_vectors(lista,model):
    """Sum the embeddings of the words in ``lista`` and L2-normalise the sum.

    Words that the model does not contain are skipped instead of raising
    ``KeyError``. If no word contributes (empty input or nothing in the
    vocabulary) a zero row is returned rather than failing on ``int.reshape``.

    Args:
        lista: Iterable of word strings.
        model: Either an object exposing its vectors on ``.wv`` (as the
            Word2Vec models in this notebook do) or any lookup that supports
            ``word in model`` and ``model[word]``.

    Returns:
        A 2-D array with a single row holding the normalised sum. When nothing
        contributed, the row is all zeros and its width is the ``vector_size``
        reported by the lookup (0 if it reports none).
    """
    # Indexing the Word2Vec object directly is deprecated in gensim; the vectors
    # live on ``.wv``. Fall back to the object itself for plain lookups.
    vectors = getattr(model, 'wv', model)

    vector_sum = None
    for word in lista:
        if word not in vectors:
            continue
        if vector_sum is None:
            # Copy so the running sum never aliases the model's own storage.
            vector_sum = np.array(vectors[word])
        else:
            vector_sum = vector_sum + vectors[word]

    if vector_sum is None:
        size = getattr(vectors, 'vector_size', getattr(model, 'vector_size', 0))
        return np.zeros((1, size))

    vector_sum = vector_sum.reshape(1, -1)
    # L2 normalisation done with numpy, so the function no longer relies on
    # ``sklearn.preprocessing``, which this notebook never imports explicitly.
    norm = np.linalg.norm(vector_sum)
    if norm == 0:
        return vector_sum
    return vector_sum / norm

In [6]:
# Timing harness for building the per-song vectors with each model.
# NOTE: both .apply(...) statements below are commented out, so each timer
# currently brackets no work and the printed durations only reflect the
# overhead of the two time.time() calls.
start_time = time.time()
#lyrics_data['cbow_song_vector'] = lyrics_data['text'].apply(cbow_songVector)
print("--- %s seconds ---" % (time.time() - start_time))
start_time = time.time()
#lyrics_data['sg_song_vector'] = lyrics_data['text'].apply(sg_songVector)
print("--- %s seconds ---" % (time.time() - start_time))

--- 7.510185241699219e-05 seconds ---
--- 7.104873657226562e-05 seconds ---


In [15]:
# Count how often each whitespace-separated token occurs in the lyrics dump,
# ignoring English stopwords.
stops = stopwords.words('english')
# Set membership is O(1); testing every token against the list is not.
stop_set = set(stops)
# Open read-only ('r+' needlessly required write access) and close the handle
# when done; the original never closed it.
with open("lyrics_text.txt", 'r', encoding='utf-8-sig') as file:
    wordcount = Counter(word for word in file.read().split() if word not in stop_set)

In [27]:
# Show the 20 most frequent tokens in wordcount together with their counts.
wordcount.most_common(20)

[("i'm", 105835),
 ('love', 92539),
 ("don't", 81028),
 ('know', 72551),
 ("it's", 68750),
 ('like', 63617),
 ('oh', 63040),
 ('got', 50299),
 ('one', 44945),
 ('go', 44280),
 ('time', 43479),
 ('get', 43450),
 ("you're", 42946),
 ('baby', 41135),
 ('see', 41100),
 ('want', 39818),
 ('never', 39506),
 ('come', 36289),
 ("can't", 36247),
 ('yeah', 34570)]

In [40]:
# Five nearest neighbours of 'love' in sg_model. Queried through .wv because
# calling most_similar on the model object itself is deprecated in gensim.
print("Love")
sg_model.wv.most_similar(positive=['love'],topn=5)

Love


[("understandin'", 0.8563092350959778),
 ('reassure', 0.8481371998786926),
 ('posses', 0.8461930751800537),
 ('belonging', 0.8394216895103455),
 ('loving', 0.8385530710220337)]

In [41]:
# Five nearest neighbours of 'love' in cbow_model. Queried through .wv because
# calling most_similar on the model object itself is deprecated in gensim.
print("Love")
cbow_model.wv.most_similar(positive=['love'],topn=5)

Love


[('loving', 0.8135256171226501),
 ('baby', 0.7297697067260742),
 ('you', 0.7092635631561279),
 ("lovin'", 0.7057769894599915),
 ('oh', 0.70156329870224)]

In [44]:
# Five nearest neighbours of 'hate' in sg_model (via .wv, the non-deprecated
# gensim entry point for similarity queries).
print("Hate")
sg_model.wv.most_similar(positive=['hate'],topn=5)

Hate


[('needless', 0.8225346207618713),
 ('extent', 0.8090395331382751),
 ('violated', 0.8051789402961731),
 ('hypocrite', 0.8036386370658875),
 ('mislead', 0.7972413301467896)]

In [47]:
# Five nearest neighbours of 'hate' in cbow_model (via .wv, the non-deprecated
# gensim entry point for similarity queries).
print("Hate")
cbow_model.wv.most_similar(positive=['hate'],topn=5)

Hate


[('sick', 0.6256549954414368),
 ('deserve', 0.5560274124145508),
 ('need', 0.5546591281890869),
 ('know', 0.5512157082557678),
 ('understand', 0.547520637512207)]

In [48]:
# Similarity score between 'hate' and 'love' in sg_model, read from .wv since
# similarity on the model object itself is deprecated in gensim.
sg_model.wv.similarity('hate', 'love')

0.7241368290421204

In [49]:
# Similarity score between 'hate' and 'love' in cbow_model, read from .wv since
# similarity on the model object itself is deprecated in gensim.
cbow_model.wv.similarity('hate','love')

0.39338540915322229

In [50]:
# Five nearest neighbours of 'bitch' in sg_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print("Bitch")
sg_model.wv.most_similar(positive=['bitch'],topn=5)

Bitch


[('nigga', 0.916361927986145),
 ('ass', 0.9091150164604187),
 ('hoe', 0.9008374214172363),
 ('shit', 0.8850321173667908),
 ('homey', 0.8825402855873108)]

In [51]:
# Five nearest neighbours of 'bitch' in cbow_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print("Bitch")
cbow_model.wv.most_similar(positive=['bitch'],topn=5)

Bitch


[('nigga', 0.8731383681297302),
 ('motherfucker', 0.8689953684806824),
 ('hoe', 0.8627142310142517),
 ('ass', 0.8226838111877441),
 ('shit', 0.8193044662475586)]

In [53]:
# Five nearest neighbours of 'death' in sg_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print("Death")
sg_model.wv.most_similar(positive=['death'],topn=5)

Death


[('undead', 0.8419730067253113),
 ('pestilence', 0.8320827484130859),
 ('violence', 0.8241569995880127),
 ('vengeance', 0.8225622177124023),
 ("lucifer's", 0.8206878900527954)]

In [52]:
# Five nearest neighbours of 'death' in cbow_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print("Death")
cbow_model.wv.most_similar(positive=['death'],topn=5)

Death


[('sin', 0.7397729754447937),
 ('vengeance', 0.684362530708313),
 ('violence', 0.6662107110023499),
 ('destruction', 0.6469458341598511),
 ('greed', 0.6430654525756836)]

In [25]:
# Five nearest neighbours of 'feel' in sg_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print("Feel")
sg_model.wv.most_similar(positive=['feel'],topn=5)

[('feels', 0.8403787016868591),
 ('uncomfortable', 0.8059965372085571),
 ('uneasy', 0.8043667078018188),
 ('touch', 0.8012054562568665),
 ('uninspired', 0.7966713309288025)]

In [55]:
# Five nearest neighbours of 'feel' in cbow_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print("Feel")
cbow_model.wv.most_similar(positive=['feel'],topn=5)

Feel


[('feels', 0.6813862323760986),
 ('feeling', 0.6737538576126099),
 ('touch', 0.6611270904541016),
 ('know', 0.6428345441818237),
 ('see', 0.6201209425926208)]

In [56]:
# Five nearest neighbours of 'time' in sg_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print('Time')
sg_model.wv.most_similar(positive=['time'],topn=5)

Time


[("now's", 0.8421967029571533),
 ("'cus", 0.8407679200172424),
 ('procardia', 0.834004819393158),
 ("waiting's", 0.8211889266967773),
 ('celebate', 0.8184973001480103)]

In [57]:
# Five nearest neighbours of 'time' in cbow_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print('Time')
cbow_model.wv.most_similar(positive=['time'],topn=5)

Time


[('moment', 0.7394558787345886),
 ('day', 0.7283280491828918),
 ('chance', 0.7216507792472839),
 ('way', 0.6558200120925903),
 ('life', 0.6341539025306702)]

In [59]:
# Five nearest neighbours of 'baby' in sg_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print('Baby')
sg_model.wv.most_similar(positive=['baby'],topn=5)

Baby


[('babe', 0.9389967918395996),
 ('darlin', 0.8708404898643494),
 ('ooh', 0.855905294418335),
 ('oohhh', 0.8557827472686768),
 ("darlin'", 0.8471555113792419)]

In [58]:
# Five nearest neighbours of 'baby' in cbow_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print('Baby')
cbow_model.wv.most_similar(positive=['baby'],topn=5)

Baby


[('babe', 0.8702223300933838),
 ("darlin'", 0.8207559585571289),
 ('oh', 0.8153826594352722),
 ('ooh', 0.8147225379943848),
 ('yeah', 0.7809575200080872)]

In [60]:
# Five nearest neighbours of 'money' in sg_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print('Money')
sg_model.wv.most_similar(positive=['money'],topn=5)

Money


[('cash', 0.8603858947753906),
 ('dough', 0.8239105939865112),
 ('bank', 0.8236185312271118),
 ('income', 0.815504252910614),
 ('sadat', 0.8130607604980469)]

In [62]:
# Five nearest neighbours of 'money' in cbow_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print('Money')
cbow_model.wv.most_similar(positive=['money'],topn=5)

Money


[('cash', 0.7845544219017029),
 ('dough', 0.7241089940071106),
 ("seinin'", 0.6978685855865479),
 ('drugs', 0.6491255760192871),
 ('business', 0.6480509042739868)]

In [63]:
# Five nearest neighbours of 'happy' in sg_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print('Happy')
sg_model.wv.most_similar(positive=['happy'],topn=5)

Happy


[('unhappy', 0.8431247472763062),
 ('glad', 0.8085960149765015),
 ('endings', 0.780279815196991),
 ('stupidly', 0.7631745338439941),
 ('birthday', 0.7594167590141296)]

In [64]:
# Five nearest neighbours of 'happy' in cbow_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print('Happy')
cbow_model.wv.most_similar(positive=['happy'],topn=5)

Happy


[('unhappy', 0.7206338047981262),
 ('glad', 0.7131701111793518),
 ('sad', 0.6280268430709839),
 ('good', 0.6064357161521912),
 ('lucky', 0.5828874111175537)]

In [65]:
# Five nearest neighbours of 'strong' in sg_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print('Strong')
sg_model.wv.most_similar(positive=['strong'],topn=5)

Strong


[('secure', 0.8102555274963379),
 ('weak', 0.805472731590271),
 ('trusting', 0.7850545644760132),
 ('enough', 0.7806623578071594),
 ('persuade', 0.7806413173675537)]

In [66]:
# Five nearest neighbours of 'strong' in cbow_model (queried via .wv; the
# model-level most_similar is deprecated in gensim).
print('Strong')
cbow_model.wv.most_similar(positive=['strong'],topn=5)

Strong


[('weak', 0.7337862253189087),
 ('alive', 0.6747564077377319),
 ('proud', 0.6446452736854553),
 ('satisfied', 0.5962284207344055),
 ('still', 0.5861331820487976)]

In [5]:
# Collect every vocabulary word of sg_model and look up the matching vectors
# in the same order. Both reads go through .wv; indexing the model object
# directly is deprecated in gensim.
vocab = list(sg_model.wv.vocab)
X = sg_model.wv[vocab]

In [None]:
# Project the word vectors to 2-D with t-SNE. The fit is stochastic, so
# random_state is pinned to make the layout reproducible across runs; the
# value 42 itself is arbitrary.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

In [None]:
# Table with one row per vocabulary word: its two t-SNE coordinates and the
# word itself.
df = pd.DataFrame(X_tsne, columns=['x', 'y'])
df['word'] = pd.Series(vocab)

In [None]:
# Create a single-axes figure and draw every word as a point at its 2-D
# coordinates.
fig, ax = plt.subplots()
ax.scatter(df['x'], df['y'])

In [None]:
# Write each word next to its own point on the scatter plot.
for label, x_pos, y_pos in zip(df['word'], df['x'], df['y']):
    ax.annotate(label, (x_pos, y_pos))

In [None]:
# Display the current matplotlib figure(s).
# NOTE(review): the figure is created and annotated in earlier cells; depending
# on the notebook's matplotlib backend it may already have been rendered when
# the cell that created it finished, so confirm the annotated plot actually
# shows up here.
plt.show()