In [1]:
import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

  from ._conv import register_converters as _register_converters


## Word2Vec Custom Implementation

In [2]:
text_corpus = "I am going to India . I am going to Bharat . I will be eating coffee . I will be drinking coffee ."
text_corpus = text_corpus.lower()

In [3]:
words = [word for word in text_corpus.split() if word!='.']
words = set(words)

In [4]:
words

{'am',
 'be',
 'bharat',
 'coffee',
 'drinking',
 'eating',
 'going',
 'i',
 'india',
 'to',
 'will'}

### Word2Int & Int2Word

In [5]:
word2int = {}
int2word = {}
corpus_size = len(words)

for index,word in enumerate(words):
    word2int[word] = index
    int2word[index] = word
    
print(word2int['india'])
print(int2word[7])

2
drinking


In [6]:
word2int

{'going': 0,
 'to': 1,
 'india': 2,
 'i': 3,
 'am': 4,
 'eating': 5,
 'coffee': 6,
 'drinking': 7,
 'be': 8,
 'bharat': 9,
 'will': 10}

In [7]:
sentences = []
[sentences.append(sentence.split()) for sentence in text_corpus.split('.') if sentence]
print(sentences)

[['i', 'am', 'going', 'to', 'india'], ['i', 'am', 'going', 'to', 'bharat'], ['i', 'will', 'be', 'eating', 'coffee'], ['i', 'will', 'be', 'drinking', 'coffee']]


### Getting the context words for a given target word of window size 2

In [8]:
window_size = 2
final_data = []

for sent in sentences:
    for index,word in enumerate(sent):
        for context_word in sent[max(index - window_size,0) : min(index + window_size, len(sent)) + 1]:
            if context_word != word:
                final_data.append([word,context_word])

final_data

[['i', 'am'],
 ['i', 'going'],
 ['am', 'i'],
 ['am', 'going'],
 ['am', 'to'],
 ['going', 'i'],
 ['going', 'am'],
 ['going', 'to'],
 ['going', 'india'],
 ['to', 'am'],
 ['to', 'going'],
 ['to', 'india'],
 ['india', 'going'],
 ['india', 'to'],
 ['i', 'am'],
 ['i', 'going'],
 ['am', 'i'],
 ['am', 'going'],
 ['am', 'to'],
 ['going', 'i'],
 ['going', 'am'],
 ['going', 'to'],
 ['going', 'bharat'],
 ['to', 'am'],
 ['to', 'going'],
 ['to', 'bharat'],
 ['bharat', 'going'],
 ['bharat', 'to'],
 ['i', 'will'],
 ['i', 'be'],
 ['will', 'i'],
 ['will', 'be'],
 ['will', 'eating'],
 ['be', 'i'],
 ['be', 'will'],
 ['be', 'eating'],
 ['be', 'coffee'],
 ['eating', 'will'],
 ['eating', 'be'],
 ['eating', 'coffee'],
 ['coffee', 'be'],
 ['coffee', 'eating'],
 ['i', 'will'],
 ['i', 'be'],
 ['will', 'i'],
 ['will', 'be'],
 ['will', 'drinking'],
 ['be', 'i'],
 ['be', 'will'],
 ['be', 'drinking'],
 ['be', 'coffee'],
 ['drinking', 'will'],
 ['drinking', 'be'],
 ['drinking', 'coffee'],
 ['coffee', 'be'],
 ['coffee

In [9]:
def one_hot_encoding(word_index,size):
    one_hot = np.zeros(size)
    one_hot[word_index] = 1
    return one_hot

### Splitting into train and test data

In [10]:
x_train = [] 
y_train = [] 

for words in final_data:
    x_train.append(one_hot_encoding(word2int[words[0]],corpus_size))
    y_train.append(one_hot_encoding(word2int[words[1]],corpus_size))

In [11]:
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

In [12]:
x_train[:3]

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]])

In [13]:
x_train.shape

(56, 11)

### Building the Neural Network model and Training the data using Cross Entropy Loss as loss function

In [14]:
x_data = tf.placeholder(tf.float32, shape=(None, corpus_size))
y_data = tf.placeholder(tf.float32, shape=(None, corpus_size))

In [15]:
embedding_size = 5

w1 = tf.Variable(tf.random_normal([corpus_size, embedding_size]))
b1 = tf.Variable(tf.random_normal([embedding_size]))
hidden_weights = tf.add(tf.matmul(x_data,w1), b1)

In [16]:
w2 = tf.Variable(tf.random_normal([embedding_size, corpus_size]))

b2 = tf.Variable(tf.random_normal([corpus_size]))

prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_weights, w2), b2))

In [17]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) 

# Loss Function

cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_data * tf.log(prediction), reduction_indices=[1]))

# Training Step
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)

num_iterations = 1000

for _ in range(num_iterations):
    sess.run(train_step, feed_dict={x_data: x_train, y_data: y_train})
    print('Loss is : ', sess.run(cross_entropy_loss, feed_dict={x_data: x_train, y_data: y_train}))

Loss is :  4.4782166
Loss is :  4.1746984
Loss is :  3.9502368
Loss is :  3.776115
Loss is :  3.6343653
Loss is :  3.5150752
Loss is :  3.4126208
Loss is :  3.3234692
Loss is :  3.2451377
Loss is :  3.1757252
Loss is :  3.113708
Loss is :  3.0578396
Loss is :  3.0071018
Loss is :  2.9606655
Loss is :  2.9178581
Loss is :  2.8781388
Loss is :  2.8410704
Loss is :  2.806301
Loss is :  2.773544
Loss is :  2.7425656
Loss is :  2.7131734
Loss is :  2.6852067
Loss is :  2.6585307
Loss is :  2.6330316
Loss is :  2.608612
Loss is :  2.585188
Loss is :  2.5626855
Loss is :  2.5410411
Loss is :  2.5201976
Loss is :  2.5001054
Loss is :  2.4807184
Loss is :  2.4619968
Loss is :  2.4439034
Loss is :  2.426405
Loss is :  2.409471
Loss is :  2.3930728
Loss is :  2.3771853
Loss is :  2.361784
Loss is :  2.346846
Loss is :  2.332352
Loss is :  2.318281
Loss is :  2.304615
Loss is :  2.2913375
Loss is :  2.2784314
Loss is :  2.2658823
Loss is :  2.253675
Loss is :  2.241796
Loss is :  2.2302327
Loss is

Loss is :  1.4240328
Loss is :  1.4237045
Loss is :  1.4233772
Loss is :  1.423051
Loss is :  1.4227257
Loss is :  1.4224014
Loss is :  1.422078
Loss is :  1.4217557
Loss is :  1.4214342
Loss is :  1.4211137
Loss is :  1.4207944
Loss is :  1.4204757
Loss is :  1.420158
Loss is :  1.4198414
Loss is :  1.4195257
Loss is :  1.419211
Loss is :  1.418897
Loss is :  1.4185841
Loss is :  1.4182721
Loss is :  1.417961
Loss is :  1.4176509
Loss is :  1.4173418
Loss is :  1.4170333
Loss is :  1.4167261
Loss is :  1.4164194
Loss is :  1.416114
Loss is :  1.4158094
Loss is :  1.4155055
Loss is :  1.4152027
Loss is :  1.4149007
Loss is :  1.4145995
Loss is :  1.4142992
Loss is :  1.4140002
Loss is :  1.4137018
Loss is :  1.4134042
Loss is :  1.4131075
Loss is :  1.4128116
Loss is :  1.4125167
Loss is :  1.4122227
Loss is :  1.4119295
Loss is :  1.4116372
Loss is :  1.4113457
Loss is :  1.4110552
Loss is :  1.4107654
Loss is :  1.4104764
Loss is :  1.4101886
Loss is :  1.4099013
Loss is :  1.4096148

In [18]:
print(sess.run(w1))
print('\n\n************\n')
print(sess.run(b1))

[[-0.9851225   0.12291209 -0.8376613   0.06456982  0.32501528]
 [ 0.19741191 -0.5658716  -2.242484   -0.01379101 -0.4817157 ]
 [-1.7173529  -1.6941004  -0.05506314 -0.23726052 -0.55309945]
 [ 0.65868753 -3.229768   -0.30366862  1.6228232   0.79802424]
 [-1.731307   -0.09106921 -0.71565187 -1.7563089  -0.86405456]
 [ 0.6670963   0.9820797   0.8369455   1.4376696   0.38501337]
 [ 0.40080607 -1.7338216   2.910719    0.35715652  1.1951851 ]
 [ 0.6131543  -0.2567973   0.6077233   1.1783066   0.5549271 ]
 [ 1.6436603   1.613455    1.0681916   0.28381637 -0.2829471 ]
 [-1.1964473  -1.3388971   0.01153098 -0.4194493  -1.2240324 ]
 [-0.7099848   0.18468183  1.8527306   0.07417105  1.7195776 ]]


************

[-0.8675415   0.40560988 -0.74447423 -0.7293968   1.744267  ]


In [19]:
vec = sess.run(w1+b1)
vec

array([[-1.852664  ,  0.52852196, -1.5821356 , -0.664827  ,  2.0692823 ],
       [-0.6701296 , -0.16026172, -2.9869583 , -0.74318784,  1.2625513 ],
       [-2.5848944 , -1.2884905 , -0.79953736, -0.96665734,  1.1911676 ],
       [-0.20885396, -2.8241582 , -1.0481429 ,  0.8934264 ,  2.5422912 ],
       [-2.5988486 ,  0.31454068, -1.4601262 , -2.4857059 ,  0.8802124 ],
       [-0.20044518,  1.3876896 ,  0.09247124,  0.7082728 ,  2.1292803 ],
       [-0.46673542, -1.3282118 ,  2.1662447 , -0.3722403 ,  2.9394522 ],
       [-0.2543872 ,  0.14881256, -0.13675094,  0.44890976,  2.299194  ],
       [ 0.7761188 ,  2.019065  ,  0.32371742, -0.44558045,  1.4613199 ],
       [-2.0639887 , -0.93328726, -0.73294324, -1.1488461 ,  0.5202346 ],
       [-1.5775263 ,  0.59029174,  1.1082563 , -0.65522575,  3.4638445 ]],
      dtype=float32)

In [20]:
print(vec[word2int['india'] ])

[-2.5848944  -1.2884905  -0.79953736 -0.96665734  1.1911676 ]


In [23]:
vec.shape

(11, 5)

### Calculating the Average word2vec for sentences

In [25]:
sent_vectors = []

for sent in sentences:
    sent_vect = np.zeros(5)
    cnt = 0
    for index,word in enumerate(sent):
        word_vect = vec[word2int[word]]
        sent_vect += word_vect
        cnt+=1
    sent_vect /= cnt
    sent_vectors.append(sent_vect)

In [26]:
sent_vectors

[array([-1.58307811, -0.68596956, -1.57538005, -0.79339032,  1.58910096]),
 array([-1.47889696, -0.61492891, -1.56206123, -0.82982808,  1.45491436]),
 array([-0.33548841, -0.03106475,  0.52850937,  0.02573054,  2.50723763]),
 array([-0.34627682, -0.27884015,  0.48266493, -0.02614207,  2.54122038])]

In [27]:
sent_vectors[0]

array([-1.58307811, -0.68596956, -1.57538005, -0.79339032,  1.58910096])

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
cosine_similarity(sent_vectors[0].reshape(-1, 1),sent_vectors[1].reshape(-1, 1))

array([[ 1.,  1.,  1.,  1., -1.],
       [ 1.,  1.,  1.,  1., -1.],
       [ 1.,  1.,  1.,  1., -1.],
       [ 1.,  1.,  1.,  1., -1.],
       [-1., -1., -1., -1.,  1.]])

#### Similarity between Sentence 1 and 2

In [51]:
from scipy import spatial
sim1 = 1 - spatial.distance.cosine(sent_vectors[0], sent_vectors[1])
sim1

0.9990372628875064

#### Similarity between Sentence 3 and 4

In [52]:
sim2 = 1 - spatial.distance.cosine(sent_vectors[2], sent_vectors[3])
sim2

0.9951470596701174

**We can see here that as the two sentences "I am going to India" and "I am going to Bharat" differ only by one word the cosine similarity score of the two sentences is very close to one.**

**Similarly the cosine similarity score of the two sentences "I will be eating coffee" and "I will be drinking coffee" are also close to one as they also differ only by one word.**

**If the text corpus is large, the word embeddings will be even more accurate and the similarity scores will improve for new sentences.**

## Gensim Implementation of Word2Vec

In [32]:
w2v_model = Word2Vec(sentences,min_count=1,size=5,window=2)

In [33]:
w2v_words = list(w2v_model.wv.vocab)

In [34]:
w2v_words

['i',
 'am',
 'going',
 'to',
 'india',
 'bharat',
 'will',
 'be',
 'eating',
 'coffee',
 'drinking']

### Calculating the Average word2vec for sentences

In [35]:
sent_vectors_w2v = []

for sent in sentences:
    sent_vect = np.zeros(5)
    cnt = 0
    for index,word in enumerate(sent):
        word_vect = w2v_model.wv[word]
        sent_vect += word_vect
        cnt+=1
    sent_vect /= cnt
    sent_vectors_w2v.append(sent_vect)

In [36]:
sent_vectors_w2v

[array([ 0.00962927, -0.03535876,  0.0057662 ,  0.02018384, -0.02800476]),
 array([ 0.00940491, -0.0367009 ,  0.01115323,  0.0024141 , -0.03248497]),
 array([ 0.00510323, -0.01051808,  0.01030579,  0.00431587, -0.01589096]),
 array([-0.01288132, -0.03627979,  0.01526736, -0.01725841, -0.01075622])]

In [53]:
cosine_similarity(sent_vectors_w2v[0].reshape(-1, 1),sent_vectors_w2v[1].reshape(-1, 1))

array([[ 1., -1.,  1.,  1., -1.],
       [-1.,  1., -1., -1.,  1.],
       [ 1., -1.,  1.,  1., -1.],
       [ 1., -1.,  1.,  1., -1.],
       [-1.,  1., -1., -1.,  1.]])

In [55]:
a2 = 1 - spatial.distance.cosine(sent_vectors_w2v[0], sent_vectors_w2v[1])
a2

0.9293752410251761

**Here we can see that the cosine similarity score is lesser than the custom implementation but if the text corpus size is increased, the Gensim Word2Vec model tends to learn better word embeddings and gives good similarity scores. **