# Word2Vec Analysis

In [1]:
import numpy as np
import pandas as pd 
data = pd.read_csv('songdata.csv')
data.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
from numba import cuda

In [5]:
l = list(data['text'])
l[0]

"Look at her face, it's a wonderful face  \nAnd it means something special to me  \nLook at the way that she smiles when she sees me  \nHow lucky can one fellow be?  \n  \nShe's just my kind of girl, she makes me feel fine  \nWho could ever believe that she could be mine?  \nShe's just my kind of girl, without her I'm blue  \nAnd if she ever leaves me what could I do, what could I do?  \n  \nAnd when we go for a walk in the park  \nAnd she holds me and squeezes my hand  \nWe'll go on walking for hours and talking  \nAbout all the things that we plan  \n  \nShe's just my kind of girl, she makes me feel fine  \nWho could ever believe that she could be mine?  \nShe's just my kind of girl, without her I'm blue  \nAnd if she ever leaves me what could I do, what could I do?\n\n"

In [32]:
import string
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.replace("\n","")
    return txt 

corpus = [clean_text(x) for x in l]
corpus[0]

'look at her face its a wonderful face  and it means something special to me  look at the way that she smiles when she sees me  how lucky can one fellow be    shes just my kind of girl she makes me feel fine  who could ever believe that she could be mine  shes just my kind of girl without her im blue  and if she ever leaves me what could i do what could i do    and when we go for a walk in the park  and she holds me and squeezes my hand  well go on walking for hours and talking  about all the things that we plan    shes just my kind of girl she makes me feel fine  who could ever believe that she could be mine  shes just my kind of girl without her im blue  and if she ever leaves me what could i do what could i do'

In [27]:
len(corpus)

57650

In [39]:
len(corpus[0])

153

In [34]:
for i in range(0,len(corpus)):
    corpus[i] = corpus[i].split(" ")
    corpus[i] = [x for x in corpus[i] if x]

In [37]:
len(corpus)

57650

In [38]:
len(corpus[0])

153

In [42]:
import gensim  
from gensim.models import Word2Vec

In [44]:
model = Word2Vec(corpus, min_count=1, size=20)

In [47]:
words = list(model.wv.vocab)
len(words)

103134

## Vectorized form of a word

In [67]:
model.wv.__getitem__('word')

array([-3.36672044,  1.6319958 ,  2.59266067,  0.55317771, -1.66056049,
        0.9137997 ,  1.28704381, -4.66750097, -5.61881304,  2.86073041,
        1.02189016,  3.89307022, -0.10596813,  0.27953875, -3.56011724,
        2.0013566 ,  1.16434729,  0.37650934, -0.99170995,  3.75102377], dtype=float32)

In [66]:
model.wv.most_similar('love')

[('loving', 0.8899664878845215),
 ('hope', 0.874354362487793),
 ('mine', 0.8602503538131714),
 ('life', 0.8560549020767212),
 ('believe', 0.8558452725410461),
 ('true', 0.8549146056175232),
 ('dream', 0.8503168821334839),
 ('raftersred', 0.8391464352607727),
 ('everything', 0.8320201635360718),
 ('promise', 0.8309168815612793)]

In [68]:
model.wv.most_similar('word')

[('story', 0.8705061674118042),
 ('book', 0.8008494973182678),
 ('name', 0.7951470613479614),
 ('hatform', 0.7924187183380127),
 ('letter', 0.7804446816444397),
 ('thing', 0.775516927242279),
 ('vow', 0.7367196679115295),
 ('voice', 0.7348980903625488),
 ('request', 0.7313973903656006),
 ('gift', 0.7308410406112671)]

In [69]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.8560591340065002),
 ('born', 0.7943884134292603),
 ('lamb', 0.7876380681991577),
 ('christ', 0.7743855118751526),
 ('child', 0.7737903594970703),
 ('jesus', 0.7685632705688477),
 ('legend', 0.7563323974609375),
 ('prince', 0.7492373585700989),
 ('iacocas', 0.7083836197853088),
 ('god', 0.7039011120796204)]

### The generic prediction works

In [71]:
model.wv.most_similar(positive=['love', 'day'], negative=['hate'])

[('night', 0.8698436617851257),
 ('moment', 0.839569091796875),
 ('morning', 0.7940908670425415),
 ('time', 0.7896767854690552),
 ('lifetime', 0.7751429080963135),
 ('dawn', 0.7643657326698303),
 ('year', 0.7614933252334595),
 ('dream', 0.7571679353713989),
 ('train', 0.7457009553909302),
 ('througth', 0.739289402961731)]

## Love to hate is like day to night..... Well well well....I'm pretty excited with the results

In [76]:
## Word that doesn't belong there
model.wv.doesnt_match("life hope dream spirit death".split())

'death'

## NOICE  

In [None]:
from sklearn.decomposition import PCA, TruncatedSVD
from matplotlib import pyplot

% matplotlib inline

X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

  
