In [None]:
# Python | Word Embedding using Word2Vec

In [78]:
# Python program to generate word vectors using Word2Vec 
  
# importing all necessary modules 
from nltk.tokenize import sent_tokenize, word_tokenize 
import gensim 
from gensim.models import Word2Vec 
  
# Read the whole corpus from 'data/alice_in_wonderland.txt'. The context
# manager closes the file handle as soon as the text has been read
# (the bare open(...).read() form left it open).
with open('data/alice_in_wonderland.txt', 'r', encoding='utf-8') as corpus_file:
    txt = corpus_file.read()

# Replace newline characters with spaces before sentence tokenization
f = txt.replace("\n", " ")

# Build the training corpus: one list of lower-cased word tokens per sentence
data = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(f)
]

# CBOW model

<b>CBOW</b> (Continuous Bag of Words): The CBOW model predicts the current word given the context words within a specific window. The input layer contains the context words and the output layer contains the current word. The size of the hidden layer is the number of dimensions in which we want to represent the current word present at the output layer.

![](data/img/cbow-1.png)

In [79]:
# Create CBOW model: 100-dimensional vectors, context window of 5 words;
# min_count = 1 keeps every token, even those seen only once.
# NOTE: `size` is the gensim 3.x keyword (renamed `vector_size` in gensim 4.0).
model1 = gensim.models.Word2Vec(data, min_count = 1,
                              size = 100, window = 5)

# Print results. Similarities are queried on the trained word vectors
# (`model1.wv`); calling `similarity` on the model itself is deprecated
# and was removed in gensim 4.0.
for other_word in ('wonderland', 'machines'):
    print("Cosine similarity between 'alice' " +
          "and '" + other_word + "' - CBOW : ",
          model1.wv.similarity('alice', other_word))

Cosine similarity between 'alice' and 'wonderland' - CBOW :  0.9995018
Cosine similarity between 'alice' and 'machines' - CBOW :  0.96151334


# Skip Gram Model

<b>Skip Gram:</b> The Skip-gram model predicts the surrounding context words within a specific window given the current word. The input layer contains the current word and the output layer contains the context words. The size of the hidden layer is the number of dimensions in which we want to represent the current word present at the input layer.

![](data/img/skip_gram.png)

In [80]:
# Create Skip Gram model: same hyper-parameters as the CBOW model, but
# sg = 1 selects the skip-gram training algorithm.
# NOTE: `size` is the gensim 3.x keyword (renamed `vector_size` in gensim 4.0).
model2 = gensim.models.Word2Vec(data, min_count = 1, size = 100,
                                             window = 5, sg = 1)

# Print results. Similarities are queried on the trained word vectors
# (`model2.wv`); calling `similarity` on the model itself is deprecated
# and was removed in gensim 4.0.
for other_word in ('wonderland', 'machines'):
    print("Cosine similarity between 'alice' " +
          "and '" + other_word + "' - Skip Gram : ",
          model2.wv.similarity('alice', other_word))


Cosine similarity between 'alice' and 'wonderland' - Skip Gram :  0.8744346
Cosine similarity between 'alice' and 'machines' - Skip Gram :  0.8473991


In [81]:
# Collect every vocabulary key of the CBOW model into a plain list
words = [word for word in model1.wv.vocab]

In [82]:
# Peek at the first five vocabulary entries
words[:5]

['the', 'project', 'gutenberg', 'ebook', 'of']

# Save and load models

In [83]:
# Save the full CBOW model object to 'model/CBOW_model.bin'
# NOTE(review): assumes the 'model/' directory already exists — confirm
	
model1.save('model/CBOW_model.bin')

In [84]:
# Export only the word vectors (model1.wv) in the text word2vec format
# (binary=False) to 'model/CBOW_model.txt'
model1.wv.save_word2vec_format('model/CBOW_model.txt', binary=False)

In [85]:
# Load the model back from the file written by model1.save()
model = Word2Vec.load('model/CBOW_model.bin')

In [86]:
# Look word vectors up through `model.wv`; indexing the Word2Vec model
# directly (model['alice']) is deprecated and was removed in gensim 4.0.
print(model.wv['alice'].shape)
# Element-wise difference between the two word vectors
model.wv['alice'] - model.wv['wonderland']

(100,)


array([ 0.10204594,  0.18292864,  0.87800646, -0.38213748,  0.2672963 ,
       -0.32852837,  0.17967647,  0.10020731, -0.27500832,  0.34737298,
       -0.40543127,  0.08899716, -0.2713697 , -0.7803048 , -0.67667294,
        0.6250244 ,  0.0229868 ,  0.14890417, -0.14119212, -0.04553616,
       -0.20357025, -0.03966459, -0.4305795 ,  0.3788454 ,  0.07458749,
       -0.2830592 , -0.025791  ,  0.00518946, -0.05386501, -0.35998458,
       -0.1914014 , -0.20371711, -0.05937751,  0.05216431,  0.25186336,
       -0.1403319 ,  0.6124175 , -0.28204593, -0.19261882,  0.20210662,
        0.40035596, -0.31916404,  0.14216146, -0.03328944, -0.22096813,
       -0.21119332, -0.08322362,  0.08064889, -0.36361706, -0.20035002,
       -0.39023587, -0.0895891 ,  0.29216364,  0.00807368, -0.12038402,
        0.130752  , -0.14050558,  0.19581953,  0.01237107, -0.03921397,
        0.13041776, -0.29148465, -0.06576138, -0.45159632,  0.05770196,
       -0.26694065,  0.5139964 , -0.3004852 ,  0.06345735,  0.06

In [87]:
# Compare 'alice' against a few other words. The query goes through
# `model.wv` because `similarity` on the model itself is deprecated
# and was removed in gensim 4.0.
for other_word in ('duck', 'hair', 'wonderland'):
    print(model.wv.similarity('alice', other_word))

0.7999394
0.99836755
0.9995018


In [88]:
# Top 5 words closest to ('alice' + 'wonderland' - 'rabbit'). The query goes
# through `model.wv` because `most_similar` on the model itself is deprecated
# and was removed in gensim 4.0.
result = model.wv.most_similar(positive=['alice', 'wonderland'], negative=['rabbit'], topn=5)
print(result)

[('large', 0.9994753003120422), ('my', 0.9994434714317322), ('say', 0.9994432926177979), ('me', 0.9994370341300964), (')', 0.9994361400604248)]


In [89]:
# Top 5 words closest to ('alice' - 'wonderland'). The query goes through
# `model.wv` because `most_similar` on the model itself is deprecated
# and was removed in gensim 4.0.
result = model.wv.most_similar(positive=['alice'], negative=['wonderland'], topn=5)
print(result)

[('picture', 0.22679075598716736), ('contempt', 0.21095998585224152), ('backs', 0.2065446376800537), ('jack-in-the-box', 0.19726744294166565), ('reasons', 0.191685751080513)]


# Load text model

In [95]:
from gensim.models import KeyedVectors
# Load the word vectors previously exported in text word2vec format (binary=False).
# NOTE(review): this rebinds `model`, which an earlier cell bound to the result
# of Word2Vec.load — cells re-run out of order will see a different object.
filename = 'model/CBOW_model.txt'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [99]:
# Similarity between 'alice' and 'wonderland', queried on the vectors loaded from the text file
model.similarity('alice','wonderland')

0.9995018